aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--drivers/infiniband/Kconfig3
-rw-r--r--drivers/infiniband/core/Makefile14
-rw-r--r--drivers/infiniband/core/addr.c226
-rw-r--r--drivers/infiniband/core/cache.c14
-rw-r--r--drivers/infiniband/core/cm.c4
-rw-r--r--drivers/infiniband/core/cma.c176
-rw-r--r--drivers/infiniband/core/core_priv.h16
-rw-r--r--drivers/infiniband/core/device.c71
-rw-r--r--drivers/infiniband/core/iwcm.c58
-rw-r--r--drivers/infiniband/core/iwcm.h2
-rw-r--r--drivers/infiniband/core/iwpm_msg.c2
-rw-r--r--drivers/infiniband/core/iwpm_util.c4
-rw-r--r--drivers/infiniband/core/mad.c19
-rw-r--r--drivers/infiniband/core/mr_pool.c86
-rw-r--r--drivers/infiniband/core/multicast.c24
-rw-r--r--drivers/infiniband/core/netlink.c11
-rw-r--r--drivers/infiniband/core/rw.c725
-rw-r--r--drivers/infiniband/core/sa_query.c254
-rw-r--r--drivers/infiniband/core/sysfs.c393
-rw-r--r--drivers/infiniband/core/ucma.c18
-rw-r--r--drivers/infiniband/core/umem.c7
-rw-r--r--drivers/infiniband/core/uverbs.h14
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c548
-rw-r--r--drivers/infiniband/core/uverbs_main.c75
-rw-r--r--drivers/infiniband/core/verbs.c356
-rw-r--r--drivers/infiniband/hw/Makefile1
-rw-r--r--drivers/infiniband/hw/cxgb3/cxio_hal.c2
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_cm.c6
-rw-r--r--drivers/infiniband/hw/cxgb3/iwch_provider.c181
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c794
-rw-r--r--drivers/infiniband/hw/cxgb4/cq.c52
-rw-r--r--drivers/infiniband/hw/cxgb4/device.c7
-rw-r--r--drivers/infiniband/hw/cxgb4/iw_cxgb4.h39
-rw-r--r--drivers/infiniband/hw/cxgb4/mem.c139
-rw-r--r--drivers/infiniband/hw/cxgb4/provider.c89
-rw-r--r--drivers/infiniband/hw/cxgb4/qp.c40
-rw-r--r--drivers/infiniband/hw/cxgb4/t4.h5
-rw-r--r--drivers/infiniband/hw/hfi1/Kconfig29
-rw-r--r--drivers/infiniband/hw/hfi1/Makefile21
-rw-r--r--drivers/infiniband/hw/hfi1/affinity.c748
-rw-r--r--drivers/infiniband/hw/hfi1/affinity.h132
-rw-r--r--drivers/infiniband/hw/hfi1/aspm.h309
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c14886
-rw-r--r--drivers/infiniband/hw/hfi1/chip.h1372
-rw-r--r--drivers/infiniband/hw/hfi1/chip_registers.h1311
-rw-r--r--drivers/infiniband/hw/hfi1/common.h411
-rw-r--r--drivers/infiniband/hw/hfi1/debugfs.c1121
-rw-r--r--drivers/infiniband/hw/hfi1/debugfs.h75
-rw-r--r--drivers/infiniband/hw/hfi1/device.c183
-rw-r--r--drivers/infiniband/hw/hfi1/device.h60
-rw-r--r--drivers/infiniband/hw/hfi1/dma.c183
-rw-r--r--drivers/infiniband/hw/hfi1/driver.c1409
-rw-r--r--drivers/infiniband/hw/hfi1/efivar.c164
-rw-r--r--drivers/infiniband/hw/hfi1/efivar.h57
-rw-r--r--drivers/infiniband/hw/hfi1/eprom.c102
-rw-r--r--drivers/infiniband/hw/hfi1/eprom.h52
-rw-r--r--drivers/infiniband/hw/hfi1/file_ops.c1514
-rw-r--r--drivers/infiniband/hw/hfi1/firmware.c2181
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h2057
-rw-r--r--drivers/infiniband/hw/hfi1/init.c1836
-rw-r--r--drivers/infiniband/hw/hfi1/intr.c200
-rw-r--r--drivers/infiniband/hw/hfi1/iowait.h300
-rw-r--r--drivers/infiniband/hw/hfi1/mad.c4434
-rw-r--r--drivers/infiniband/hw/hfi1/mad.h432
-rw-r--r--drivers/infiniband/hw/hfi1/mmu_rb.c369
-rw-r--r--drivers/infiniband/hw/hfi1/mmu_rb.h87
-rw-r--r--drivers/infiniband/hw/hfi1/opa_compat.h111
-rw-r--r--drivers/infiniband/hw/hfi1/pcie.c1398
-rw-r--r--drivers/infiniband/hw/hfi1/pio.c2105
-rw-r--r--drivers/infiniband/hw/hfi1/pio.h328
-rw-r--r--drivers/infiniband/hw/hfi1/pio_copy.c879
-rw-r--r--drivers/infiniband/hw/hfi1/platform.c899
-rw-r--r--drivers/infiniband/hw/hfi1/platform.h305
-rw-r--r--drivers/infiniband/hw/hfi1/qp.c1032
-rw-r--r--drivers/infiniband/hw/hfi1/qp.h162
-rw-r--r--drivers/infiniband/hw/hfi1/qsfp.c856
-rw-r--r--drivers/infiniband/hw/hfi1/qsfp.h246
-rw-r--r--drivers/infiniband/hw/hfi1/rc.c2620
-rw-r--r--drivers/infiniband/hw/hfi1/ruc.c1002
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.c3054
-rw-r--r--drivers/infiniband/hw/hfi1/sdma.h1082
-rw-r--r--drivers/infiniband/hw/hfi1/sdma_txreq.h135
-rw-r--r--drivers/infiniband/hw/hfi1/sysfs.c810
-rw-r--r--drivers/infiniband/hw/hfi1/trace.c230
-rw-r--r--drivers/infiniband/hw/hfi1/trace.h53
-rw-r--r--drivers/infiniband/hw/hfi1/trace_ctxts.h141
-rw-r--r--drivers/infiniband/hw/hfi1/trace_dbg.h155
-rw-r--r--drivers/infiniband/hw/hfi1/trace_ibhdrs.h209
-rw-r--r--drivers/infiniband/hw/hfi1/trace_misc.h81
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rc.h123
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rx.h322
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tx.h642
-rw-r--r--drivers/infiniband/hw/hfi1/uc.c595
-rw-r--r--drivers/infiniband/hw/hfi1/ud.c866
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.c1056
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h79
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c136
-rw-r--r--drivers/infiniband/hw/hfi1/user_sdma.c1669
-rw-r--r--drivers/infiniband/hw/hfi1/user_sdma.h84
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c1789
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h529
-rw-r--r--drivers/infiniband/hw/hfi1/verbs_txreq.c147
-rw-r--r--drivers/infiniband/hw/hfi1/verbs_txreq.h117
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw.h13
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_cm.c178
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_cm.h10
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_ctrl.c185
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_d.h7
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_hw.c15
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_main.c73
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_osdep.h1
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_pble.c9
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_puda.c6
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_status.h1
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_type.h16
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_uk.c135
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_user.h38
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_utils.c52
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_verbs.c523
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_verbs.h3
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_vf.c2
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_vf.h2
-rw-r--r--drivers/infiniband/hw/i40iw/i40iw_virtchnl.c102
-rw-r--r--drivers/infiniband/hw/mlx4/ah.c2
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c44
-rw-r--r--drivers/infiniband/hw/mlx4/mad.c47
-rw-r--r--drivers/infiniband/hw/mlx4/main.c234
-rw-r--r--drivers/infiniband/hw/mlx4/mcg.c23
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h18
-rw-r--r--drivers/infiniband/hw/mlx4/mr.c41
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c81
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c126
-rw-r--r--drivers/infiniband/hw/mlx5/gsi.c19
-rw-r--r--drivers/infiniband/hw/mlx5/mad.c2
-rw-r--r--drivers/infiniband/hw/mlx5/main.c587
-rw-r--r--drivers/infiniband/hw/mlx5/mem.c6
-rw-r--r--drivers/infiniband/hw/mlx5/mlx5_ib.h83
-rw-r--r--drivers/infiniband/hw/mlx5/mr.c29
-rw-r--r--drivers/infiniband/hw/mlx5/qp.c772
-rw-r--r--drivers/infiniband/hw/mlx5/srq.c112
-rw-r--r--drivers/infiniband/hw/mlx5/user.h88
-rw-r--r--drivers/infiniband/hw/mthca/mthca_provider.c24
-rw-r--r--drivers/infiniband/hw/mthca/mthca_reset.c42
-rw-r--r--drivers/infiniband/hw/nes/nes_nic.c16
-rw-r--r--drivers/infiniband/hw/nes/nes_utils.c60
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.c76
-rw-r--r--drivers/infiniband/hw/nes/nes_verbs.h2
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_hw.c14
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_main.c19
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_sli.h12
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.c11
-rw-r--r--drivers/infiniband/hw/ocrdma/ocrdma_verbs.h5
-rw-r--r--drivers/infiniband/hw/qib/qib_debugfs.c12
-rw-r--r--drivers/infiniband/hw/qib/qib_file_ops.c10
-rw-r--r--drivers/infiniband/hw/qib/qib_fs.c26
-rw-r--r--drivers/infiniband/hw/qib/qib_iba7322.c15
-rw-r--r--drivers/infiniband/hw/qib/qib_init.c4
-rw-r--r--drivers/infiniband/hw/qib/qib_mad.c6
-rw-r--r--drivers/infiniband/hw/qib/qib_pcie.c6
-rw-r--r--drivers/infiniband/hw/qib/qib_qp.c47
-rw-r--r--drivers/infiniband/hw/qib/qib_rc.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_ruc.c4
-rw-r--r--drivers/infiniband/hw/qib/qib_uc.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_ud.c18
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.c2
-rw-r--r--drivers/infiniband/hw/qib/qib_verbs.h9
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_main.c19
-rw-r--r--drivers/infiniband/hw/usnic/usnic_ib_sysfs.c17
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c5
-rw-r--r--drivers/infiniband/sw/Makefile1
-rw-r--r--drivers/infiniband/sw/rdmavt/Kconfig1
-rw-r--r--drivers/infiniband/sw/rdmavt/cq.c2
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.c130
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.h2
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c293
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.c27
-rw-r--r--drivers/infiniband/sw/rxe/Kconfig24
-rw-r--r--drivers/infiniband/sw/rxe/Makefile24
-rw-r--r--drivers/infiniband/sw/rxe/rxe.c405
-rw-r--r--drivers/infiniband/sw/rxe/rxe.h77
-rw-r--r--drivers/infiniband/sw/rxe/rxe_av.c98
-rw-r--r--drivers/infiniband/sw/rxe/rxe_comp.c747
-rw-r--r--drivers/infiniband/sw/rxe/rxe_cq.c165
-rw-r--r--drivers/infiniband/sw/rxe/rxe_dma.c166
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hdr.h952
-rw-r--r--drivers/infiniband/sw/rxe/rxe_icrc.c96
-rw-r--r--drivers/infiniband/sw/rxe/rxe_loc.h286
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mcast.c190
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mmap.c173
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c643
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c703
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.h56
-rw-r--r--drivers/infiniband/sw/rxe/rxe_opcode.c961
-rw-r--r--drivers/infiniband/sw/rxe/rxe_opcode.h129
-rw-r--r--drivers/infiniband/sw/rxe/rxe_param.h172
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c502
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.h163
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c851
-rw-r--r--drivers/infiniband/sw/rxe/rxe_queue.c217
-rw-r--r--drivers/infiniband/sw/rxe/rxe_queue.h178
-rw-r--r--drivers/infiniband/sw/rxe/rxe_recv.c420
-rw-r--r--drivers/infiniband/sw/rxe/rxe_req.c757
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c1381
-rw-r--r--drivers/infiniband/sw/rxe/rxe_srq.c193
-rw-r--r--drivers/infiniband/sw/rxe/rxe_sysfs.c157
-rw-r--r--drivers/infiniband/sw/rxe/rxe_task.c154
-rw-r--r--drivers/infiniband/sw/rxe/rxe_task.h95
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c1330
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.h480
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h6
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c22
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ethtool.c73
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c122
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c154
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_multicast.c48
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c13
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_vlan.c8
-rw-r--r--drivers/infiniband/ulp/iser/iser_memory.c4
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.c879
-rw-r--r--drivers/infiniband/ulp/isert/ib_isert.h72
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.c242
-rw-r--r--drivers/infiniband/ulp/srp/ib_srp.h2
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.c746
-rw-r--r--drivers/infiniband/ulp/srpt/ib_srpt.h36
224 files changed, 83123 insertions, 3432 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 6425c0e5d18a..e9b7dc037ff8 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -84,5 +84,8 @@ source "drivers/infiniband/ulp/iser/Kconfig"
source "drivers/infiniband/ulp/isert/Kconfig"
source "drivers/infiniband/sw/rdmavt/Kconfig"
+source "drivers/infiniband/sw/rxe/Kconfig"
+
+source "drivers/infiniband/hw/hfi1/Kconfig"
endif # INFINIBAND
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index f818538a7f4e..edaae9f9853c 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -1,23 +1,19 @@
infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o
user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
-obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \
- ib_cm.o iw_cm.o ib_addr.o \
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \
$(infiniband-y)
obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
$(user_access-y)
-ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \
+ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o \
- roce_gid_mgmt.o
+ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
+ multicast.o mad.o smi.o agent.o mad_rmpp.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
-ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
-
-ib_sa-y := sa_query.o multicast.o
-
ib_cm-y := cm.o
iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
@@ -28,8 +24,6 @@ rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
rdma_ucm-y := ucma.o
-ib_addr-y := addr.o
-
ib_umad-y := user_mad.o
ib_ucm-y := ucm.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 337353d86cfa..1374541a4528 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -46,10 +46,10 @@
#include <net/ip6_route.h>
#include <rdma/ib_addr.h>
#include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
-MODULE_AUTHOR("Sean Hefty");
-MODULE_DESCRIPTION("IB Address Translation");
-MODULE_LICENSE("Dual BSD/GPL");
+#include "core_priv.h"
struct addr_req {
struct list_head list;
@@ -62,8 +62,11 @@ struct addr_req {
struct rdma_dev_addr *addr, void *context);
unsigned long timeout;
int status;
+ u32 seq;
};
+static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);
+
static void process_req(struct work_struct *work);
static DEFINE_MUTEX(lock);
@@ -71,6 +74,126 @@ static LIST_HEAD(req_list);
static DECLARE_DELAYED_WORK(work, process_req);
static struct workqueue_struct *addr_wq;
+static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+};
+
+static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return false;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_addr_policy);
+ if (ret)
+ return false;
+
+ return true;
+}
+
+static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
+{
+ const struct nlattr *head, *curr;
+ union ib_gid gid;
+ struct addr_req *req;
+ int len, rem;
+ int found = 0;
+
+ head = (const struct nlattr *)nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_DGID)
+ memcpy(&gid, nla_data(curr), nla_len(curr));
+ }
+
+ mutex_lock(&lock);
+ list_for_each_entry(req, &req_list, list) {
+ if (nlh->nlmsg_seq != req->seq)
+ continue;
+ /* We set the DGID part, the rest was set earlier */
+ rdma_addr_set_dgid(req->addr, &gid);
+ req->status = 0;
+ found = 1;
+ break;
+ }
+ mutex_unlock(&lock);
+
+ if (!found)
+ pr_info("Couldn't find request waiting for DGID: %pI6\n",
+ &gid);
+}
+
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+
+ if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk) ||
+ !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (ib_nl_is_good_ip_resp(nlh))
+ ib_nl_process_good_ip_rsep(nlh);
+
+ return skb->len;
+}
+
+static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
+ const void *daddr,
+ u32 seq, u16 family)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ struct rdma_ls_ip_resolve_header *header;
+ void *data;
+ size_t size;
+ int attrtype;
+ int len;
+
+ if (family == AF_INET) {
+ size = sizeof(struct in_addr);
+ attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
+ } else {
+ size = sizeof(struct in6_addr);
+ attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
+ }
+
+ len = nla_total_size(sizeof(size));
+ len += NLMSG_ALIGN(sizeof(*header));
+
+ skb = nlmsg_new(len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ nlmsg_free(skb);
+ return -ENODATA;
+ }
+
+ /* Construct the family header first */
+ header = (struct rdma_ls_ip_resolve_header *)
+ skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ header->ifindex = dev_addr->bound_dev_if;
+ nla_put(skb, attrtype, size, daddr);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+ ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+
+ /* Make the request retry, so when we get the response from userspace
+ * we will have something.
+ */
+ return -ENODATA;
+}
+
int rdma_addr_size(struct sockaddr *addr)
{
switch (addr->sa_family) {
@@ -199,6 +322,17 @@ static void queue_req(struct addr_req *req)
mutex_unlock(&lock);
}
+static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+ const void *daddr, u32 seq, u16 family)
+{
+ if (ibnl_chk_listeners(RDMA_NL_GROUP_LS))
+ return -EADDRNOTAVAIL;
+
+ /* We fill in what we can, the response will fill the rest */
+ rdma_copy_addr(dev_addr, dst->dev, NULL);
+ return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
+}
+
static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
const void *daddr)
{
@@ -223,6 +357,39 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
return ret;
}
+static bool has_gateway(struct dst_entry *dst, sa_family_t family)
+{
+ struct rtable *rt;
+ struct rt6_info *rt6;
+
+ if (family == AF_INET) {
+ rt = container_of(dst, struct rtable, dst);
+ return rt->rt_uses_gateway;
+ }
+
+ rt6 = container_of(dst, struct rt6_info, dst);
+ return rt6->rt6i_flags & RTF_GATEWAY;
+}
+
+static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *dst_in, u32 seq)
+{
+ const struct sockaddr_in *dst_in4 =
+ (const struct sockaddr_in *)dst_in;
+ const struct sockaddr_in6 *dst_in6 =
+ (const struct sockaddr_in6 *)dst_in;
+ const void *daddr = (dst_in->sa_family == AF_INET) ?
+ (const void *)&dst_in4->sin_addr.s_addr :
+ (const void *)&dst_in6->sin6_addr;
+ sa_family_t family = dst_in->sa_family;
+
+ /* Gateway + ARPHRD_INFINIBAND -> IB router */
+ if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND)
+ return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family);
+ else
+ return dst_fetch_ha(dst, dev_addr, daddr);
+}
+
static int addr4_resolve(struct sockaddr_in *src_in,
const struct sockaddr_in *dst_in,
struct rdma_dev_addr *addr,
@@ -246,10 +413,11 @@ static int addr4_resolve(struct sockaddr_in *src_in,
src_in->sin_family = AF_INET;
src_in->sin_addr.s_addr = fl4.saddr;
- /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
- * routable) and we could set the network type accordingly.
+ /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
+ * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
+ * type accordingly.
*/
- if (rt->rt_uses_gateway)
+ if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND)
addr->network = RDMA_NETWORK_IPV4;
addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
@@ -291,10 +459,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
src_in->sin6_addr = fl6.saddr;
}
- /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
- * routable) and we could set the network type accordingly.
+ /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're
+ * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network
+ * type accordingly.
*/
- if (rt->rt6i_flags & RTF_GATEWAY)
+ if (rt->rt6i_flags & RTF_GATEWAY &&
+ ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND)
addr->network = RDMA_NETWORK_IPV6;
addr->hoplimit = ip6_dst_hoplimit(dst);
@@ -317,7 +487,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
static int addr_resolve_neigh(struct dst_entry *dst,
const struct sockaddr *dst_in,
- struct rdma_dev_addr *addr)
+ struct rdma_dev_addr *addr,
+ u32 seq)
{
if (dst->dev->flags & IFF_LOOPBACK) {
int ret;
@@ -331,17 +502,8 @@ static int addr_resolve_neigh(struct dst_entry *dst,
}
/* If the device doesn't do ARP internally */
- if (!(dst->dev->flags & IFF_NOARP)) {
- const struct sockaddr_in *dst_in4 =
- (const struct sockaddr_in *)dst_in;
- const struct sockaddr_in6 *dst_in6 =
- (const struct sockaddr_in6 *)dst_in;
-
- return dst_fetch_ha(dst, addr,
- dst_in->sa_family == AF_INET ?
- (const void *)&dst_in4->sin_addr.s_addr :
- (const void *)&dst_in6->sin6_addr);
- }
+ if (!(dst->dev->flags & IFF_NOARP))
+ return fetch_ha(dst, addr, dst_in, seq);
return rdma_copy_addr(addr, dst->dev, NULL);
}
@@ -349,7 +511,8 @@ static int addr_resolve_neigh(struct dst_entry *dst,
static int addr_resolve(struct sockaddr *src_in,
const struct sockaddr *dst_in,
struct rdma_dev_addr *addr,
- bool resolve_neigh)
+ bool resolve_neigh,
+ u32 seq)
{
struct net_device *ndev;
struct dst_entry *dst;
@@ -366,7 +529,7 @@ static int addr_resolve(struct sockaddr *src_in,
return ret;
if (resolve_neigh)
- ret = addr_resolve_neigh(&rt->dst, dst_in, addr);
+ ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq);
ndev = rt->dst.dev;
dev_hold(ndev);
@@ -383,7 +546,7 @@ static int addr_resolve(struct sockaddr *src_in,
return ret;
if (resolve_neigh)
- ret = addr_resolve_neigh(dst, dst_in, addr);
+ ret = addr_resolve_neigh(dst, dst_in, addr, seq);
ndev = dst->dev;
dev_hold(ndev);
@@ -412,7 +575,7 @@ static void process_req(struct work_struct *work)
src_in = (struct sockaddr *) &req->src_addr;
dst_in = (struct sockaddr *) &req->dst_addr;
req->status = addr_resolve(src_in, dst_in, req->addr,
- true);
+ true, req->seq);
if (req->status && time_after_eq(jiffies, req->timeout))
req->status = -ETIMEDOUT;
else if (req->status == -ENODATA)
@@ -471,8 +634,9 @@ int rdma_resolve_ip(struct rdma_addr_client *client,
req->context = context;
req->client = client;
atomic_inc(&client->refcount);
+ req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
- req->status = addr_resolve(src_in, dst_in, addr, true);
+ req->status = addr_resolve(src_in, dst_in, addr, true, req->seq);
switch (req->status) {
case 0:
req->timeout = jiffies;
@@ -510,7 +674,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr,
src_in->sa_family = dst_addr->sa_family;
}
- return addr_resolve(src_in, dst_addr, addr, false);
+ return addr_resolve(src_in, dst_addr, addr, false, 0);
}
EXPORT_SYMBOL(rdma_resolve_ip_route);
@@ -634,7 +798,7 @@ static struct notifier_block nb = {
.notifier_call = netevent_callback
};
-static int __init addr_init(void)
+int addr_init(void)
{
addr_wq = create_singlethread_workqueue("ib_addr");
if (!addr_wq)
@@ -642,15 +806,13 @@ static int __init addr_init(void)
register_netevent_notifier(&nb);
rdma_addr_register_client(&self);
+
return 0;
}
-static void __exit addr_cleanup(void)
+void addr_cleanup(void)
{
rdma_addr_unregister_client(&self);
unregister_netevent_notifier(&nb);
destroy_workqueue(addr_wq);
}
-
-module_init(addr_init);
-module_exit(addr_cleanup);
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index c2e257d97eff..1a2984c28b95 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -178,6 +178,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
{
int ret = 0;
struct net_device *old_net_dev;
+ enum ib_gid_type old_gid_type;
/* in rdma_cap_roce_gid_table, this funciton should be protected by a
* sleep-able lock.
@@ -199,6 +200,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
}
old_net_dev = table->data_vec[ix].attr.ndev;
+ old_gid_type = table->data_vec[ix].attr.gid_type;
if (old_net_dev && old_net_dev != attr->ndev)
dev_put(old_net_dev);
/* if modify_gid failed, just delete the old gid */
@@ -207,10 +209,14 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
attr = &zattr;
table->data_vec[ix].context = NULL;
}
- if (default_gid)
- table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+
memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+ if (default_gid) {
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+ if (action == GID_TABLE_WRITE_ACTION_DEL)
+ table->data_vec[ix].attr.gid_type = old_gid_type;
+ }
if (table->data_vec[ix].attr.ndev &&
table->data_vec[ix].attr.ndev != old_net_dev)
dev_hold(table->data_vec[ix].attr.ndev);
@@ -405,7 +411,9 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
for (ix = 0; ix < table->sz; ix++)
if (table->data_vec[ix].attr.ndev == ndev)
- if (!del_gid(ib_dev, port, table, ix, false))
+ if (!del_gid(ib_dev, port, table, ix,
+ !!(table->data_vec[ix].props &
+ GID_TABLE_ENTRY_DEFAULT)))
deleted = true;
write_unlock_irq(&table->rwlock);
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 1d92e091e22e..c99525512b34 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3452,14 +3452,14 @@ static int cm_establish(struct ib_cm_id *cm_id)
work->cm_event.event = IB_CM_USER_ESTABLISHED;
/* Check if the device started its remove_one */
- spin_lock_irq(&cm.lock);
+ spin_lock_irqsave(&cm.lock, flags);
if (!cm_dev->going_down) {
queue_delayed_work(cm.wq, &work->work, 0);
} else {
kfree(work);
ret = -ENODEV;
}
- spin_unlock_irq(&cm.lock);
+ spin_unlock_irqrestore(&cm.lock, flags);
out:
return ret;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 93ab0ae97208..5f65a78b27c9 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -68,6 +68,7 @@ MODULE_DESCRIPTION("Generic RDMA CM Agent");
MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_QUERY_CLASSPORT_INFO_TIMEOUT 3000
#define CMA_MAX_CM_RETRIES 15
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 18
@@ -162,6 +163,14 @@ struct rdma_bind_list {
unsigned short port;
};
+struct class_port_info_context {
+ struct ib_class_port_info *class_port_info;
+ struct ib_device *device;
+ struct completion done;
+ struct ib_sa_query *sa_query;
+ u8 port_num;
+};
+
static int cma_ps_alloc(struct net *net, enum rdma_port_space ps,
struct rdma_bind_list *bind_list, int snum)
{
@@ -306,6 +315,7 @@ struct cma_multicast {
struct sockaddr_storage addr;
struct kref mcref;
bool igmp_joined;
+ u8 join_state;
};
struct cma_work {
@@ -708,17 +718,6 @@ static void cma_deref_id(struct rdma_id_private *id_priv)
complete(&id_priv->comp);
}
-static int cma_disable_callback(struct rdma_id_private *id_priv,
- enum rdma_cm_state state)
-{
- mutex_lock(&id_priv->handler_mutex);
- if (id_priv->state != state) {
- mutex_unlock(&id_priv->handler_mutex);
- return -EINVAL;
- }
- return 0;
-}
-
struct rdma_cm_id *rdma_create_id(struct net *net,
rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps,
@@ -800,6 +799,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
if (id->device != pd->device)
return -EINVAL;
+ qp_init_attr->port_num = id->port_num;
qp = ib_create_qp(pd, qp_init_attr);
if (IS_ERR(qp))
return PTR_ERR(qp);
@@ -1670,11 +1670,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
struct rdma_cm_event event;
int ret = 0;
+ mutex_lock(&id_priv->handler_mutex);
if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
- cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
+ id_priv->state != RDMA_CM_CONNECT) ||
(ib_event->event == IB_CM_TIMEWAIT_EXIT &&
- cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
- return 0;
+ id_priv->state != RDMA_CM_DISCONNECT))
+ goto out;
memset(&event, 0, sizeof event);
switch (ib_event->event) {
@@ -1869,7 +1870,7 @@ static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_e
static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
- struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_id_private *listen_id, *conn_id = NULL;
struct rdma_cm_event event;
struct net_device *net_dev;
int offset, ret;
@@ -1883,9 +1884,10 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
goto net_dev_put;
}
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+ mutex_lock(&listen_id->handler_mutex);
+ if (listen_id->state != RDMA_CM_LISTEN) {
ret = -ECONNABORTED;
- goto net_dev_put;
+ goto err1;
}
memset(&event, 0, sizeof event);
@@ -1975,8 +1977,9 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
- if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
- return 0;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_CONNECT)
+ goto out;
memset(&event, 0, sizeof event);
switch (iw_event->event) {
@@ -2028,6 +2031,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
return ret;
}
+out:
mutex_unlock(&id_priv->handler_mutex);
return ret;
}
@@ -2038,13 +2042,15 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct rdma_cm_id *new_cm_id;
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
- int ret;
+ int ret = -ECONNABORTED;
struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
listen_id = cm_id->context;
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
- return -ECONNABORTED;
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (listen_id->state != RDMA_CM_LISTEN)
+ goto out;
/* Create a new RDMA id for the new IW CM ID */
new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net,
@@ -2456,18 +2462,24 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
if (addr->dev_addr.bound_dev_if) {
ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
- if (!ndev)
- return -ENODEV;
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
if (ndev->flags & IFF_LOOPBACK) {
dev_put(ndev);
- if (!id_priv->id.device->get_netdev)
- return -EOPNOTSUPP;
+ if (!id_priv->id.device->get_netdev) {
+ ret = -EOPNOTSUPP;
+ goto err2;
+ }
ndev = id_priv->id.device->get_netdev(id_priv->id.device,
id_priv->id.port_num);
- if (!ndev)
- return -ENODEV;
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
}
route->path_rec->net = &init_net;
@@ -3215,8 +3227,9 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
int ret = 0;
- if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
- return 0;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_CONNECT)
+ goto out;
memset(&event, 0, sizeof event);
switch (ib_event->event) {
@@ -3672,12 +3685,13 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
struct rdma_id_private *id_priv;
struct cma_multicast *mc = multicast->context;
struct rdma_cm_event event;
- int ret;
+ int ret = 0;
id_priv = mc->id_priv;
- if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
- cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
- return 0;
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != RDMA_CM_ADDR_BOUND &&
+ id_priv->state != RDMA_CM_ADDR_RESOLVED)
+ goto out;
if (!status)
status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
@@ -3719,6 +3733,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
return 0;
}
+out:
mutex_unlock(&id_priv->handler_mutex);
return 0;
}
@@ -3753,10 +3768,63 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
}
}
+static void cma_query_sa_classport_info_cb(int status,
+ struct ib_class_port_info *rec,
+ void *context)
+{
+ struct class_port_info_context *cb_ctx = context;
+
+ WARN_ON(!context);
+
+ if (status || !rec) {
+ pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n",
+ cb_ctx->device->name, cb_ctx->port_num, status);
+ goto out;
+ }
+
+ memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info));
+
+out:
+ complete(&cb_ctx->done);
+}
+
+static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num,
+ struct ib_class_port_info *class_port_info)
+{
+ struct class_port_info_context *cb_ctx;
+ int ret;
+
+ cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL);
+ if (!cb_ctx)
+ return -ENOMEM;
+
+ cb_ctx->device = device;
+ cb_ctx->class_port_info = class_port_info;
+ cb_ctx->port_num = port_num;
+ init_completion(&cb_ctx->done);
+
+ ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num,
+ CMA_QUERY_CLASSPORT_INFO_TIMEOUT,
+ GFP_KERNEL, cma_query_sa_classport_info_cb,
+ cb_ctx, &cb_ctx->sa_query);
+ if (ret < 0) {
+ pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n",
+ device->name, port_num, ret);
+ goto out;
+ }
+
+ wait_for_completion(&cb_ctx->done);
+
+out:
+ kfree(cb_ctx);
+ return ret;
+}
+
static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
struct cma_multicast *mc)
{
struct ib_sa_mcmember_rec rec;
+ struct ib_class_port_info class_port_info;
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
ib_sa_comp_mask comp_mask;
int ret;
@@ -3775,7 +3843,24 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
rec.qkey = cpu_to_be32(id_priv->qkey);
rdma_addr_get_sgid(dev_addr, &rec.port_gid);
rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
- rec.join_state = 1;
+ rec.join_state = mc->join_state;
+
+ if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) {
+ ret = cma_query_sa_classport_info(id_priv->id.device,
+ id_priv->id.port_num,
+ &class_port_info);
+
+ if (ret)
+ return ret;
+
+ if (!(ib_get_cpi_capmask2(&class_port_info) &
+ IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) {
+ pr_warn("RDMA CM: %s port %u Unable to multicast join\n"
+ "RDMA CM: SM doesn't support Send Only Full Member option\n",
+ id_priv->id.device->name, id_priv->id.port_num);
+ return -EOPNOTSUPP;
+ }
+ }
comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
@@ -3844,6 +3929,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
struct sockaddr *addr = (struct sockaddr *)&mc->addr;
struct net_device *ndev = NULL;
enum ib_gid_type gid_type;
+ bool send_only;
+
+ send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
if (cma_zero_addr((struct sockaddr *)&mc->addr))
return -EINVAL;
@@ -3877,12 +3965,14 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
rdma_start_port(id_priv->cma_dev->device)];
if (addr->sa_family == AF_INET) {
- if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
- err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
- true);
- if (!err) {
- mc->igmp_joined = true;
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+ if (!send_only) {
+ err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+ true);
+ if (!err)
+ mc->igmp_joined = true;
+ }
}
} else {
if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
@@ -3912,7 +4002,7 @@ out1:
}
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
- void *context)
+ u8 join_state, void *context)
{
struct rdma_id_private *id_priv;
struct cma_multicast *mc;
@@ -3931,6 +4021,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
mc->context = context;
mc->id_priv = id_priv;
mc->igmp_joined = false;
+ mc->join_state = join_state;
spin_lock(&id_priv->lock);
list_add(&mc->list, &id_priv->mc_list);
spin_unlock(&id_priv->lock);
@@ -4294,7 +4385,8 @@ static int __init cma_init(void)
if (ret)
goto err;
- if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
+ if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table),
+ cma_cb_table))
pr_warn("RDMA CMA: failed to add netlink callback\n");
cma_configfs_init();
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index eab32215756b..19d499dcab76 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -137,4 +137,20 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
return _upper == upper;
}
+int addr_init(void);
+void addr_cleanup(void);
+
+int ib_mad_init(void);
+void ib_mad_cleanup(void);
+
+int ib_sa_init(void);
+void ib_sa_cleanup(void);
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+ struct netlink_callback *cb);
+
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 10979844026a..760ef603a468 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -311,6 +311,15 @@ static int read_port_immutable(struct ib_device *device)
return 0;
}
+void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len)
+{
+ if (dev->get_dev_fw_str)
+ dev->get_dev_fw_str(dev, str, str_len);
+ else
+ str[0] = '\0';
+}
+EXPORT_SYMBOL(ib_get_device_fw_str);
+
/**
* ib_register_device - Register an IB device with IB core
* @device:Device to register
@@ -661,6 +670,9 @@ int ib_query_port(struct ib_device *device,
if (err || port_attr->subnet_prefix)
return err;
+ if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
+ return 0;
+
err = ib_query_gid(device, port_num, 0, &gid, NULL);
if (err)
return err;
@@ -955,6 +967,29 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);
+static struct ibnl_client_cbs ibnl_ls_cb_table[] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .dump = ib_nl_handle_resolve_resp,
+ .module = THIS_MODULE },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .dump = ib_nl_handle_set_timeout,
+ .module = THIS_MODULE },
+ [RDMA_NL_LS_OP_IP_RESOLVE] = {
+ .dump = ib_nl_handle_ip_res_resp,
+ .module = THIS_MODULE },
+};
+
+static int ib_add_ibnl_clients(void)
+{
+ return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table),
+ ibnl_ls_cb_table);
+}
+
+static void ib_remove_ibnl_clients(void)
+{
+ ibnl_remove_client(RDMA_NL_LS);
+}
+
static int __init ib_core_init(void)
{
int ret;
@@ -983,10 +1018,42 @@ static int __init ib_core_init(void)
goto err_sysfs;
}
+ ret = addr_init();
+ if (ret) {
+ pr_warn("Could't init IB address resolution\n");
+ goto err_ibnl;
+ }
+
+ ret = ib_mad_init();
+ if (ret) {
+ pr_warn("Couldn't init IB MAD\n");
+ goto err_addr;
+ }
+
+ ret = ib_sa_init();
+ if (ret) {
+ pr_warn("Couldn't init SA\n");
+ goto err_mad;
+ }
+
+ ret = ib_add_ibnl_clients();
+ if (ret) {
+ pr_warn("Couldn't register ibnl clients\n");
+ goto err_sa;
+ }
+
ib_cache_setup();
return 0;
+err_sa:
+ ib_sa_cleanup();
+err_mad:
+ ib_mad_cleanup();
+err_addr:
+ addr_cleanup();
+err_ibnl:
+ ibnl_cleanup();
err_sysfs:
class_unregister(&ib_class);
err_comp:
@@ -999,6 +1066,10 @@ err:
static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
+ ib_remove_ibnl_clients();
+ ib_sa_cleanup();
+ ib_mad_cleanup();
+ addr_cleanup();
ibnl_cleanup();
class_unregister(&ib_class);
destroy_workqueue(ib_comp_wq);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index e28a160cdab0..357624f8b9d3 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -183,15 +183,14 @@ static void free_cm_id(struct iwcm_id_private *cm_id_priv)
/*
* Release a reference on cm_id. If the last reference is being
- * released, enable the waiting thread (in iw_destroy_cm_id) to
- * get woken up, and return 1 if a thread is already waiting.
+ * released, free the cm_id and return 1.
*/
static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
{
BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
if (atomic_dec_and_test(&cm_id_priv->refcount)) {
BUG_ON(!list_empty(&cm_id_priv->work_list));
- complete(&cm_id_priv->destroy_comp);
+ free_cm_id(cm_id_priv);
return 1;
}
@@ -208,19 +207,10 @@ static void add_ref(struct iw_cm_id *cm_id)
static void rem_ref(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
- int cb_destroy;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
- /*
- * Test bit before deref in case the cm_id gets freed on another
- * thread.
- */
- cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
- BUG_ON(!list_empty(&cm_id_priv->work_list));
- free_cm_id(cm_id_priv);
- }
+ (void)iwcm_deref_id(cm_id_priv);
}
static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
@@ -370,6 +360,12 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
wait_event(cm_id_priv->connect_wait,
!test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+ /*
+ * Since we're deleting the cm_id, drop any events that
+ * might arrive before the last dereference.
+ */
+ set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
+
spin_lock_irqsave(&cm_id_priv->lock, flags);
switch (cm_id_priv->state) {
case IW_CM_STATE_LISTEN:
@@ -433,13 +429,7 @@ void iw_destroy_cm_id(struct iw_cm_id *cm_id)
struct iwcm_id_private *cm_id_priv;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
- BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
-
destroy_cm_id(cm_id);
-
- wait_for_completion(&cm_id_priv->destroy_comp);
-
- free_cm_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@@ -459,7 +449,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
if (pm_addr->ss_family == AF_INET) {
struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr;
- if (pm4_addr->sin_addr.s_addr == INADDR_ANY) {
+ if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) {
struct sockaddr_in *cm4_addr =
(struct sockaddr_in *)cm_addr;
struct sockaddr_in *cm4_outaddr =
@@ -809,10 +799,7 @@ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
ret = cm_id->cm_handler(cm_id, iw_event);
if (ret) {
iw_cm_reject(cm_id, NULL, 0);
- set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- destroy_cm_id(cm_id);
- if (atomic_read(&cm_id_priv->refcount)==0)
- free_cm_id(cm_id_priv);
+ iw_destroy_cm_id(cm_id);
}
out:
@@ -1000,7 +987,6 @@ static void cm_work_handler(struct work_struct *_work)
unsigned long flags;
int empty;
int ret = 0;
- int destroy_id;
spin_lock_irqsave(&cm_id_priv->lock, flags);
empty = list_empty(&cm_id_priv->work_list);
@@ -1013,20 +999,14 @@ static void cm_work_handler(struct work_struct *_work)
put_work(work);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
- ret = process_event(cm_id_priv, &levent);
- if (ret) {
- set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- destroy_cm_id(&cm_id_priv->id);
- }
- BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
- destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
- if (iwcm_deref_id(cm_id_priv)) {
- if (destroy_id) {
- BUG_ON(!list_empty(&cm_id_priv->work_list));
- free_cm_id(cm_id_priv);
- }
+ if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+ ret = process_event(cm_id_priv, &levent);
+ if (ret)
+ destroy_cm_id(&cm_id_priv->id);
+ } else
+ pr_debug("dropping event %d\n", levent.event);
+ if (iwcm_deref_id(cm_id_priv))
return;
- }
if (empty)
return;
spin_lock_irqsave(&cm_id_priv->lock, flags);
@@ -1175,7 +1155,7 @@ static int __init iw_cm_init(void)
if (ret)
pr_err("iw_cm: couldn't init iwpm\n");
- ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS,
+ ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table),
iwcm_nl_cb_table);
if (ret)
pr_err("iw_cm: couldn't register netlink callbacks\n");
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
index 3f6cc82564c8..82c2cd1b0a80 100644
--- a/drivers/infiniband/core/iwcm.h
+++ b/drivers/infiniband/core/iwcm.h
@@ -56,7 +56,7 @@ struct iwcm_id_private {
struct list_head work_free_list;
};
-#define IWCM_F_CALLBACK_DESTROY 1
+#define IWCM_F_DROP_EVENTS 1
#define IWCM_F_CONNECT_WAIT 2
#endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
index 43e3fa27102b..1c41b95cefec 100644
--- a/drivers/infiniband/core/iwpm_msg.c
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -506,7 +506,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
if (!nlmsg_request) {
pr_info("%s: Could not find a matching request (seq = %u)\n",
__func__, msg_seq);
- return -EINVAL;
+ return -EINVAL;
}
pm_msg = nlmsg_request->req_buffer;
local_sockaddr = (struct sockaddr_storage *)
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
index 9b2bf2fb2b00..ade71e7f0131 100644
--- a/drivers/infiniband/core/iwpm_util.c
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -37,6 +37,7 @@
#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
#define IWPM_REMINFO_HASH_SIZE 64
#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+#define IWPM_MSG_SIZE 512
static LIST_HEAD(iwpm_nlmsg_req_list);
static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
@@ -452,7 +453,7 @@ struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
{
struct sk_buff *skb = NULL;
- skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ skb = dev_alloc_skb(IWPM_MSG_SIZE);
if (!skb) {
pr_err("%s Unable to allocate skb\n", __func__);
goto create_nlmsg_exit;
@@ -634,6 +635,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+ dev_kfree_skb(skb);
return -ENOMEM;
}
nlh->nlmsg_type = NLMSG_DONE;
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 9fa5bf33f5a3..2d49228f28b2 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -47,11 +47,7 @@
#include "smi.h"
#include "opa_smi.h"
#include "agent.h"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_DESCRIPTION("kernel IB MAD API");
-MODULE_AUTHOR("Hal Rosenstock");
-MODULE_AUTHOR("Sean Hefty");
+#include "core_priv.h"
static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
@@ -1642,9 +1638,9 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
/* Now, check to see if there are any methods still in use */
if (!check_method_table(method)) {
/* If not, release management method table */
- kfree(method);
- class->method_table[mgmt_class] = NULL;
- /* Any management classes left ? */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
if (!check_class_table(class)) {
/* If not, release management class table */
kfree(class);
@@ -3316,7 +3312,7 @@ static struct ib_client mad_client = {
.remove = ib_mad_remove_device
};
-static int __init ib_mad_init_module(void)
+int ib_mad_init(void)
{
mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
@@ -3334,10 +3330,7 @@ static int __init ib_mad_init_module(void)
return 0;
}
-static void __exit ib_mad_cleanup_module(void)
+void ib_mad_cleanup(void)
{
ib_unregister_client(&mad_client);
}
-
-module_init(ib_mad_init_module);
-module_exit(ib_mad_cleanup_module);
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
new file mode 100644
index 000000000000..49d478b2ea94
--- /dev/null
+++ b/drivers/infiniband/core/mr_pool.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/mr_pool.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ mr = list_first_entry_or_null(list, struct ib_mr, qp_entry);
+ if (mr) {
+ list_del(&mr->qp_entry);
+ qp->mrs_used++;
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_mr_pool_get);
+
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add(&mr->qp_entry, list);
+ qp->mrs_used--;
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_put);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+ enum ib_mr_type type, u32 max_num_sg)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+ int ret, i;
+
+ for (i = 0; i < nr; i++) {
+ mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto out;
+ }
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add_tail(&mr->qp_entry, list);
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ }
+
+ return 0;
+out:
+ ib_mr_pool_destroy(qp, list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_mr_pool_init);
+
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ while (!list_empty(list)) {
+ mr = list_first_entry(list, struct ib_mr, qp_entry);
+ list_del(&mr->qp_entry);
+
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ ib_dereg_mr(mr);
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_destroy);
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 250937cb9a1a..51c79b2fb0b8 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -102,11 +102,10 @@ struct mcast_group {
struct list_head pending_list;
struct list_head active_list;
struct mcast_member *last_join;
- int members[3];
+ int members[NUM_JOIN_MEMBERSHIP_TYPES];
atomic_t refcount;
enum mcast_group_state state;
struct ib_sa_query *query;
- int query_id;
u16 pkey_index;
u8 leave_state;
int retries;
@@ -220,8 +219,9 @@ static void queue_join(struct mcast_member *member)
}
/*
- * A multicast group has three types of members: full member, non member, and
- * send only member. We need to keep track of the number of members of each
+ * A multicast group has four types of members: full member, non member,
+ * sendonly non member and sendonly full member.
+ * We need to keep track of the number of members of each
* type based on their join state. Adjust the number of members the belong to
* the specified join states.
*/
@@ -229,7 +229,7 @@ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
{
int i;
- for (i = 0; i < 3; i++, join_state >>= 1)
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
if (join_state & 0x1)
group->members[i] += inc;
}
@@ -245,7 +245,7 @@ static u8 get_leave_state(struct mcast_group *group)
u8 leave_state = 0;
int i;
- for (i = 0; i < 3; i++)
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
if (!group->members[i])
leave_state |= (0x1 << i);
@@ -339,11 +339,7 @@ static int send_join(struct mcast_group *group, struct mcast_member *member)
member->multicast.comp_mask,
3000, GFP_KERNEL, join_handler, group,
&group->query);
- if (ret >= 0) {
- group->query_id = ret;
- ret = 0;
- }
- return ret;
+ return (ret > 0) ? 0 : ret;
}
static int send_leave(struct mcast_group *group, u8 leave_state)
@@ -363,11 +359,7 @@ static int send_leave(struct mcast_group *group, u8 leave_state)
IB_SA_MCMEMBER_REC_JOIN_STATE,
3000, GFP_KERNEL, leave_handler,
group, &group->query);
- if (ret >= 0) {
- group->query_id = ret;
- ret = 0;
- }
- return ret;
+ return (ret > 0) ? 0 : ret;
}
static void join_group(struct mcast_group *group, struct mcast_member *member,
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index d47df9356779..10469b0088b5 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -151,12 +151,11 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct ibnl_client *client;
int type = nlh->nlmsg_type;
int index = RDMA_NL_GET_CLIENT(type);
- int op = RDMA_NL_GET_OP(type);
+ unsigned int op = RDMA_NL_GET_OP(type);
list_for_each_entry(client, &client_list, list) {
if (client->index == index) {
- if (op < 0 || op >= client->nops ||
- !client->cb_table[op].dump)
+ if (op >= client->nops || !client->cb_table[op].dump)
return -EINVAL;
/*
@@ -230,7 +229,10 @@ static void ibnl_rcv(struct sk_buff *skb)
int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
__u32 pid)
{
- return nlmsg_unicast(nls, skb, pid);
+ int err;
+
+ err = netlink_unicast(nls, skb, pid, 0);
+ return (err < 0) ? err : 0;
}
EXPORT_SYMBOL(ibnl_unicast);
@@ -253,6 +255,7 @@ int __init ibnl_init(void)
return -ENOMEM;
}
+ nls->sk_sndtimeo = 10 * HZ;
return 0;
}
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
new file mode 100644
index 000000000000..dbfd854c32c9
--- /dev/null
+++ b/drivers/infiniband/core/rw.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <rdma/mr_pool.h>
+#include <rdma/rw.h>
+
+enum {
+ RDMA_RW_SINGLE_WR,
+ RDMA_RW_MULTI_WR,
+ RDMA_RW_MR,
+ RDMA_RW_SIG_MR,
+};
+
+static bool rdma_rw_force_mr;
+module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
+MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
+
+/*
+ * Check if the device might use memory registration. This is currently only
+ * true for iWarp devices. In the future we can hopefully fine tune this based
+ * on HCA driver input.
+ */
+static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num)
+{
+ if (rdma_protocol_iwarp(dev, port_num))
+ return true;
+ if (unlikely(rdma_rw_force_mr))
+ return true;
+ return false;
+}
+
+/*
+ * Check if the device will use memory registration for this RW operation.
+ * We currently always use memory registrations for iWarp RDMA READs, and
+ * have a debug option to force usage of MRs.
+ *
+ * XXX: In the future we can hopefully fine tune this based on HCA driver
+ * input.
+ */
+static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
+ enum dma_data_direction dir, int dma_nents)
+{
+ if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE)
+ return true;
+ if (unlikely(rdma_rw_force_mr))
+ return true;
+ return false;
+}
+
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
+{
+ /* arbitrary limit to avoid allocating gigantic resources */
+ return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
+}
+
+/* Caller must have zero-initialized *reg. */
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
+ struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
+ u32 sg_cnt, u32 offset)
+{
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+ u32 nents = min(sg_cnt, pages_per_mr);
+ int count = 0, ret;
+
+ reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
+ if (!reg->mr)
+ return -EAGAIN;
+
+ if (reg->mr->need_inval) {
+ reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+ reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+ reg->inv_wr.next = &reg->reg_wr.wr;
+ count++;
+ } else {
+ reg->inv_wr.next = NULL;
+ }
+
+ ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
+ if (ret < nents) {
+ ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
+ return -EINVAL;
+ }
+
+ reg->reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg->reg_wr.mr = reg->mr;
+ reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+ if (rdma_protocol_iwarp(qp->device, port_num))
+ reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+ count++;
+
+ reg->sge.addr = reg->mr->iova;
+ reg->sge.length = reg->mr->length;
+ return count;
+}
+
+static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct rdma_rw_reg_ctx *prev = NULL;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+ int i, j, ret = 0, count = 0;
+
+ ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr;
+ ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+ u32 nents = min(sg_cnt, pages_per_mr);
+
+ ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
+ offset);
+ if (ret < 0)
+ goto out_free;
+ count += ret;
+
+ if (prev) {
+ if (reg->mr->need_inval)
+ prev->wr.wr.next = &reg->inv_wr;
+ else
+ prev->wr.wr.next = &reg->reg_wr.wr;
+ }
+
+ reg->reg_wr.wr.next = &reg->wr.wr;
+
+ reg->wr.wr.sg_list = &reg->sge;
+ reg->wr.wr.num_sge = 1;
+ reg->wr.remote_addr = remote_addr;
+ reg->wr.rkey = rkey;
+ if (dir == DMA_TO_DEVICE) {
+ reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+ } else if (!rdma_cap_read_inv(qp->device, port_num)) {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ;
+ } else {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+ }
+ count++;
+
+ remote_addr += reg->sge.length;
+ sg_cnt -= nents;
+ for (j = 0; j < nents; j++)
+ sg = sg_next(sg);
+ prev = reg;
+ offset = 0;
+ }
+
+ if (prev)
+ prev->wr.wr.next = NULL;
+
+ ctx->type = RDMA_RW_MR;
+ return count;
+
+out_free:
+ while (--i >= 0)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+out:
+ return ret;
+}
+
+static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ struct scatterlist *sg, u32 sg_cnt, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+ qp->max_read_sge;
+ struct ib_sge *sge;
+ u32 total_len = 0, i, j;
+
+ ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
+
+ ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
+ if (!ctx->map.sges)
+ goto out;
+
+ ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+ if (!ctx->map.wrs)
+ goto out_free_sges;
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+ u32 nr_sge = min(sg_cnt, max_sge);
+
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->remote_addr = remote_addr + total_len;
+ rdma_wr->rkey = rkey;
+ rdma_wr->wr.num_sge = nr_sge;
+ rdma_wr->wr.sg_list = sge;
+
+ for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
+ sge->addr = ib_sg_dma_address(dev, sg) + offset;
+ sge->length = ib_sg_dma_len(dev, sg) - offset;
+ sge->lkey = qp->pd->local_dma_lkey;
+
+ total_len += sge->length;
+ sge++;
+ sg_cnt--;
+ offset = 0;
+ }
+
+ rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+ &ctx->map.wrs[i + 1].wr : NULL;
+ }
+
+ ctx->type = RDMA_RW_MULTI_WR;
+ return ctx->nr_ops;
+
+out_free_sges:
+ kfree(ctx->map.sges);
+out:
+ return -ENOMEM;
+}
+
+static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+
+ ctx->nr_ops = 1;
+
+ ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+ ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset;
+ ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset;
+
+ memset(rdma_wr, 0, sizeof(*rdma_wr));
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->wr.sg_list = &ctx->single.sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+
+ ctx->type = RDMA_RW_SINGLE_WR;
+ return 1;
+}
+
+/**
+ * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @sg_cnt: number of entries in @sg
+ * @sg_offset: current byte offset into @sg
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ int ret;
+
+ ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+ if (!ret)
+ return -ENOMEM;
+ sg_cnt = ret;
+
+ /*
+ * Skip to the S/G entry that sg_offset falls into:
+ */
+ for (;;) {
+ u32 len = ib_sg_dma_len(dev, sg);
+
+ if (sg_offset < len)
+ break;
+
+ sg = sg_next(sg);
+ sg_offset -= len;
+ sg_cnt--;
+ }
+
+ ret = -EIO;
+ if (WARN_ON_ONCE(sg_cnt == 0))
+ goto out_unmap_sg;
+
+ if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
+ ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
+ sg_offset, remote_addr, rkey, dir);
+ } else if (sg_cnt > 1) {
+ ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
+ remote_addr, rkey, dir);
+ } else {
+ ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
+ remote_addr, rkey, dir);
+ }
+
+ if (ret < 0)
+ goto out_unmap_sg;
+ return ret;
+
+out_unmap_sg:
+ ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init);
+
+/**
+ * rdma_rw_ctx_signature init - initialize a RW context with signature offload
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @sg_cnt: number of entries in @sg
+ * @prot_sg: scatterlist to READ/WRITE protection information from/to
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @sig_attrs: signature offloading algorithms
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ struct ib_sig_attrs *sig_attrs,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
+ struct ib_rdma_wr *rdma_wr;
+ struct ib_send_wr *prev_wr = NULL;
+ int count = 0, ret;
+
+ if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
+ pr_err("SG count too large\n");
+ return -EINVAL;
+ }
+
+ ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+ if (!ret)
+ return -ENOMEM;
+ sg_cnt = ret;
+
+ ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+ if (!ret) {
+ ret = -ENOMEM;
+ goto out_unmap_sg;
+ }
+ prot_sg_cnt = ret;
+
+ ctx->type = RDMA_RW_SIG_MR;
+ ctx->nr_ops = 1;
+ ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL);
+ if (!ctx->sig) {
+ ret = -ENOMEM;
+ goto out_unmap_prot_sg;
+ }
+
+ ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0);
+ if (ret < 0)
+ goto out_free_ctx;
+ count += ret;
+ prev_wr = &ctx->sig->data.reg_wr.wr;
+
+ if (prot_sg_cnt) {
+ ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot,
+ prot_sg, prot_sg_cnt, 0);
+ if (ret < 0)
+ goto out_destroy_data_mr;
+ count += ret;
+
+ if (ctx->sig->prot.inv_wr.next)
+ prev_wr->next = &ctx->sig->prot.inv_wr;
+ else
+ prev_wr->next = &ctx->sig->prot.reg_wr.wr;
+ prev_wr = &ctx->sig->prot.reg_wr.wr;
+ } else {
+ ctx->sig->prot.mr = NULL;
+ }
+
+ ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+ if (!ctx->sig->sig_mr) {
+ ret = -EAGAIN;
+ goto out_destroy_prot_mr;
+ }
+
+ if (ctx->sig->sig_mr->need_inval) {
+ memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr));
+
+ ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV;
+ ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey;
+
+ prev_wr->next = &ctx->sig->sig_inv_wr;
+ prev_wr = &ctx->sig->sig_inv_wr;
+ }
+
+ ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
+ ctx->sig->sig_wr.wr.wr_cqe = NULL;
+ ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge;
+ ctx->sig->sig_wr.wr.num_sge = 1;
+ ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
+ ctx->sig->sig_wr.sig_attrs = sig_attrs;
+ ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr;
+ if (prot_sg_cnt)
+ ctx->sig->sig_wr.prot = &ctx->sig->prot.sge;
+ prev_wr->next = &ctx->sig->sig_wr.wr;
+ prev_wr = &ctx->sig->sig_wr.wr;
+ count++;
+
+ ctx->sig->sig_sge.addr = 0;
+ ctx->sig->sig_sge.length = ctx->sig->data.sge.length;
+ if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE)
+ ctx->sig->sig_sge.length += ctx->sig->prot.sge.length;
+
+ rdma_wr = &ctx->sig->data.wr;
+ rdma_wr->wr.sg_list = &ctx->sig->sig_sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ prev_wr->next = &rdma_wr->wr;
+ prev_wr = &rdma_wr->wr;
+ count++;
+
+ return count;
+
+out_destroy_prot_mr:
+ if (prot_sg_cnt)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+out_destroy_data_mr:
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+out_free_ctx:
+ kfree(ctx->sig);
+out_unmap_prot_sg:
+ ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+out_unmap_sg:
+ ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
+
+/*
+ * Now that we are going to post the WRs we can update the lkey and need_inval
+ * state on the MRs. If we were doing this at init time, we would get double
+ * or missing invalidations if a context was initialized but not actually
+ * posted.
+ */
+static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
+{
+ reg->mr->need_inval = need_inval;
+ ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
+ reg->reg_wr.key = reg->mr->lkey;
+ reg->sge.lkey = reg->mr->lkey;
+}
+
+/**
+ * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Return the WR chain for the set of RDMA READ/WRITE operations described by
+ * @ctx, as well as any memory registration operations needed. If @chain_wr
+ * is non-NULL the WR it points to will be appended to the chain of WRs posted.
+ * If @chain_wr is not set @cqe must be set so that the caller gets a
+ * completion notification.
+ */
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr, *last_wr;
+ int i;
+
+ switch (ctx->type) {
+ case RDMA_RW_SIG_MR:
+ rdma_rw_update_lkey(&ctx->sig->data, true);
+ if (ctx->sig->prot.mr)
+ rdma_rw_update_lkey(&ctx->sig->prot, true);
+
+ ctx->sig->sig_mr->need_inval = true;
+ ib_update_fast_reg_key(ctx->sig->sig_mr,
+ ib_inc_rkey(ctx->sig->sig_mr->lkey));
+ ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey;
+
+ if (ctx->sig->data.inv_wr.next)
+ first_wr = &ctx->sig->data.inv_wr;
+ else
+ first_wr = &ctx->sig->data.reg_wr.wr;
+ last_wr = &ctx->sig->data.wr.wr;
+ break;
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++) {
+ rdma_rw_update_lkey(&ctx->reg[i],
+ ctx->reg[i].wr.wr.opcode !=
+ IB_WR_RDMA_READ_WITH_INV);
+ }
+
+ if (ctx->reg[0].inv_wr.next)
+ first_wr = &ctx->reg[0].inv_wr;
+ else
+ first_wr = &ctx->reg[0].reg_wr.wr;
+ last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
+ break;
+ case RDMA_RW_MULTI_WR:
+ first_wr = &ctx->map.wrs[0].wr;
+ last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
+ break;
+ case RDMA_RW_SINGLE_WR:
+ first_wr = &ctx->single.wr.wr;
+ last_wr = &ctx->single.wr.wr;
+ break;
+ default:
+ BUG();
+ }
+
+ if (chain_wr) {
+ last_wr->next = chain_wr;
+ } else {
+ last_wr->wr_cqe = cqe;
+ last_wr->send_flags |= IB_SEND_SIGNALED;
+ }
+
+ return first_wr;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_wrs);
+
+/**
+ * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
+ * any memory registration operations needed. If @chain_wr is non-NULL the
+ * WR it points to will be appended to the chain of WRs posted. If @chain_wr
+ * is not set @cqe must be set so that the caller gets a completion
+ * notification.
+ */
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr, *bad_wr;
+
+ first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
+ return ib_post_send(qp, first_wr, &bad_wr);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_post);
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+{
+ int i;
+
+ switch (ctx->type) {
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+ break;
+ case RDMA_RW_MULTI_WR:
+ kfree(ctx->map.wrs);
+ kfree(ctx->map.sges);
+ break;
+ case RDMA_RW_SINGLE_WR:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+
+/**
+ * rdma_rw_ctx_destroy_signature - release all resources allocated by
+ * rdma_rw_ctx_init_signature
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @prot_sg: scatterlist that was used for the READ/WRITE of the PI
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ enum dma_data_direction dir)
+{
+ if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
+ return;
+
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr);
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+
+ if (ctx->sig->prot.mr) {
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr);
+ ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+ }
+
+ ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr);
+ kfree(ctx->sig);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
+
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
+{
+ u32 factor;
+
+ WARN_ON_ONCE(attr->port_num == 0);
+
+ /*
+ * Each context needs at least one RDMA READ or WRITE WR.
+ *
+ * For some hardware we might need more, eventually we should ask the
+ * HCA driver for a multiplier here.
+ */
+ factor = 1;
+
+ /*
+ * If the devices needs MRs to perform RDMA READ or WRITE operations,
+ * we'll need two additional MRs for the registrations and the
+ * invalidation.
+ */
+ if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+ factor += 6; /* (inv + reg) * (data + prot + sig) */
+ else if (rdma_rw_can_use_mr(dev, attr->port_num))
+ factor += 2; /* inv + reg */
+
+ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
+
+ /*
+ * But maybe we were just too high in the sky and the device doesn't
+ * even support all we need, and we'll have to live with what we get..
+ */
+ attr->cap.max_send_wr =
+ min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
+}
+
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 nr_mrs = 0, nr_sig_mrs = 0;
+ int ret = 0;
+
+ if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) {
+ nr_sig_mrs = attr->cap.max_rdma_ctxs;
+ nr_mrs = attr->cap.max_rdma_ctxs * 2;
+ } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
+ nr_mrs = attr->cap.max_rdma_ctxs;
+ }
+
+ if (nr_mrs) {
+ ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
+ IB_MR_TYPE_MEM_REG,
+ rdma_rw_fr_page_list_len(dev));
+ if (ret) {
+ pr_err("%s: failed to allocated %d MRs\n",
+ __func__, nr_mrs);
+ return ret;
+ }
+ }
+
+ if (nr_sig_mrs) {
+ ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
+ IB_MR_TYPE_SIGNATURE, 2);
+ if (ret) {
+ pr_err("%s: failed to allocated %d SIG MRs\n",
+ __func__, nr_mrs);
+ goto out_free_rdma_mrs;
+ }
+ }
+
+ return 0;
+
+out_free_rdma_mrs:
+ ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+ return ret;
+}
+
+void rdma_rw_cleanup_mrs(struct ib_qp *qp)
+{
+ ib_mr_pool_destroy(qp, &qp->sig_mrs);
+ ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 8a09c0fb268d..b9bf7aa055e7 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -53,10 +53,6 @@
#include "sa.h"
#include "core_priv.h"
-MODULE_AUTHOR("Roland Dreier");
-MODULE_DESCRIPTION("InfiniBand subnet administration query support");
-MODULE_LICENSE("Dual BSD/GPL");
-
#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
@@ -69,10 +65,17 @@ struct ib_sa_sm_ah {
u8 src_path_mask;
};
+struct ib_sa_classport_cache {
+ bool valid;
+ struct ib_class_port_info data;
+};
+
struct ib_sa_port {
struct ib_mad_agent *agent;
struct ib_sa_sm_ah *sm_ah;
struct work_struct update_task;
+ struct ib_sa_classport_cache classport_info;
+ spinlock_t classport_lock; /* protects class port info set */
spinlock_t ah_lock;
u8 port_num;
};
@@ -119,6 +122,12 @@ struct ib_sa_guidinfo_query {
struct ib_sa_query sa_query;
};
+struct ib_sa_classport_info_query {
+ void (*callback)(int, struct ib_class_port_info *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
struct ib_sa_mcmember_query {
void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
void *context;
@@ -392,6 +401,82 @@ static const struct ib_field service_rec_table[] = {
.size_bits = 2*64 },
};
+#define CLASSPORTINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \
+ .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \
+ .field_name = "ib_class_port_info:" #field
+
+static const struct ib_field classport_info_rec_table[] = {
+ { CLASSPORTINFO_REC_FIELD(base_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(class_version),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(capability_mask),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(redirect_tcslfl),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_lid),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(redirect_pkey),
+ .offset_words = 7,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(redirect_qp),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_qkey),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_gid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(trap_tcslfl),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_lid),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(trap_pkey),
+ .offset_words = 15,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_hlqp),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(trap_qkey),
+ .offset_words = 17,
+ .offset_bits = 0,
+ .size_bits = 32 },
+};
+
#define GUIDINFO_REC_FIELD(field) \
.struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
.struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \
@@ -536,7 +621,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
if (!data) {
- kfree_skb(skb);
+ nlmsg_free(skb);
return -EMSGSIZE;
}
@@ -705,8 +790,8 @@ static void ib_nl_request_timeout(struct work_struct *work)
spin_unlock_irqrestore(&ib_nl_request_lock, flags);
}
-static int ib_nl_handle_set_timeout(struct sk_buff *skb,
- struct netlink_callback *cb)
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
int timeout, delta, abs_delta;
@@ -782,8 +867,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
return 1;
}
-static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
- struct netlink_callback *cb)
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
unsigned long flags;
@@ -838,15 +923,6 @@ resp_out:
return skb->len;
}
-static struct ibnl_client_cbs ib_sa_cb_table[] = {
- [RDMA_NL_LS_OP_RESOLVE] = {
- .dump = ib_nl_handle_resolve_resp,
- .module = THIS_MODULE },
- [RDMA_NL_LS_OP_SET_TIMEOUT] = {
- .dump = ib_nl_handle_set_timeout,
- .module = THIS_MODULE },
-};
-
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -929,6 +1005,13 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event
port->sm_ah = NULL;
spin_unlock_irqrestore(&port->ah_lock, flags);
+ if (event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_LID_CHANGE) {
+ spin_lock_irqsave(&port->classport_lock, flags);
+ port->classport_info.valid = false;
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ }
queue_work(ib_wq, &sa_dev->port[event->element.port_num -
sa_dev->start_port].update_task);
}
@@ -1645,6 +1728,121 @@ err1:
}
EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+/* Support get SA ClassPortInfo */
+static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ unsigned long flags;
+ struct ib_sa_classport_info_query *query =
+ container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
+
+ if (mad) {
+ struct ib_class_port_info rec;
+
+ ib_unpack(classport_info_rec_table,
+ ARRAY_SIZE(classport_info_rec_table),
+ mad->data, &rec);
+
+ spin_lock_irqsave(&sa_query->port->classport_lock, flags);
+ if (!status && !sa_query->port->classport_info.valid) {
+ memcpy(&sa_query->port->classport_info.data, &rec,
+ sizeof(sa_query->port->classport_info.data));
+
+ sa_query->port->classport_info.valid = true;
+ }
+ spin_unlock_irqrestore(&sa_query->port->classport_lock, flags);
+
+ query->callback(status, &rec, query->context);
+ } else {
+ query->callback(status, NULL, query->context);
+ }
+}
+
+static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_classport_info_query,
+ sa_query));
+}
+
+int ib_sa_classport_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_class_port_info *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_classport_info_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ struct ib_class_port_info cached_class_port_info;
+ int ret;
+ unsigned long flags;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ /* Use cached ClassPortInfo attribute if valid instead of sending mad */
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if (port->classport_info.valid && callback) {
+ memcpy(&cached_class_port_info, &port->classport_info.data,
+ sizeof(cached_class_port_info));
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ callback(0, &cached_class_port_info, context);
+ return 0;
+ }
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL;
+
+ query->sa_query.release = ib_sa_portclass_info_rec_release;
+ /* support GET only */
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO);
+ mad->sa_hdr.comp_mask = 0;
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_classport_info_rec_query);
+
static void send_handler(struct ib_mad_agent *agent,
struct ib_mad_send_wc *mad_send_wc)
{
@@ -1725,6 +1923,9 @@ static void ib_sa_add_one(struct ib_device *device)
sa_dev->port[i].sm_ah = NULL;
sa_dev->port[i].port_num = i + s;
+ spin_lock_init(&sa_dev->port[i].classport_lock);
+ sa_dev->port[i].classport_info.valid = false;
+
sa_dev->port[i].agent =
ib_register_mad_agent(device, i + s, IB_QPT_GSI,
NULL, 0, send_handler,
@@ -1794,7 +1995,7 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data)
kfree(sa_dev);
}
-static int __init ib_sa_init(void)
+int ib_sa_init(void)
{
int ret;
@@ -1820,17 +2021,10 @@ static int __init ib_sa_init(void)
goto err3;
}
- if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
- ib_sa_cb_table)) {
- pr_err("Failed to add netlink callback\n");
- ret = -EINVAL;
- goto err4;
- }
INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
return 0;
-err4:
- destroy_workqueue(ib_nl_wq);
+
err3:
mcast_cleanup();
err2:
@@ -1839,9 +2033,8 @@ err1:
return ret;
}
-static void __exit ib_sa_cleanup(void)
+void ib_sa_cleanup(void)
{
- ibnl_remove_client(RDMA_NL_LS);
cancel_delayed_work(&ib_nl_timed_work);
flush_workqueue(ib_nl_wq);
destroy_workqueue(ib_nl_wq);
@@ -1849,6 +2042,3 @@ static void __exit ib_sa_cleanup(void)
ib_unregister_client(&sa_client);
idr_destroy(&query_idr);
}
-
-module_init(ib_sa_init);
-module_exit(ib_sa_cleanup);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 14606afbfaa8..15defefecb4f 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -38,6 +38,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/netdevice.h>
+#include <linux/ethtool.h>
#include <rdma/ib_mad.h>
#include <rdma/ib_pma.h>
@@ -56,8 +57,10 @@ struct ib_port {
struct gid_attr_group *gid_attr_group;
struct attribute_group gid_group;
struct attribute_group pkey_group;
- u8 port_num;
struct attribute_group *pma_table;
+ struct attribute_group *hw_stats_ag;
+ struct rdma_hw_stats *hw_stats;
+ u8 port_num;
};
struct port_attribute {
@@ -80,6 +83,18 @@ struct port_table_attribute {
__be16 attr_id;
};
+struct hw_stats_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf,
+ size_t count);
+ int index;
+ u8 port_num;
+};
+
static ssize_t port_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
@@ -516,6 +531,7 @@ static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320);
/*
* Counters added by extended set
@@ -546,6 +562,7 @@ static struct attribute *pma_attrs[] = {
&port_pma_attr_port_rcv_data.attr.attr,
&port_pma_attr_port_xmit_packets.attr.attr,
&port_pma_attr_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
NULL
};
@@ -565,6 +582,7 @@ static struct attribute *pma_attrs_ext[] = {
&port_pma_attr_ext_port_xmit_data.attr.attr,
&port_pma_attr_ext_port_rcv_data.attr.attr,
&port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
&port_pma_attr_ext_port_rcv_packets.attr.attr,
&port_pma_attr_ext_unicast_rcv_packets.attr.attr,
&port_pma_attr_ext_unicast_xmit_packets.attr.attr,
@@ -590,6 +608,7 @@ static struct attribute *pma_attrs_noietf[] = {
&port_pma_attr_ext_port_rcv_data.attr.attr,
&port_pma_attr_ext_port_xmit_packets.attr.attr,
&port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
NULL
};
@@ -733,6 +752,220 @@ static struct attribute_group *get_counter_table(struct ib_device *dev,
return &pma_group;
}
+static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
+ u8 port_num, int index)
+{
+ int ret;
+
+ if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
+ return 0;
+ ret = dev->get_hw_stats(dev, stats, port_num, index);
+ if (ret < 0)
+ return ret;
+ if (ret == stats->num_counters)
+ stats->timestamp = jiffies;
+
+ return 0;
+}
+
+static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+{
+ return sprintf(buf, "%llu\n", stats->value[index]);
+}
+
+static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev;
+ struct ib_port *port;
+ struct hw_stats_attribute *hsa;
+ struct rdma_hw_stats *stats;
+ int ret;
+
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ stats = dev->hw_stats;
+ } else {
+ port = container_of(kobj, struct ib_port, kobj);
+ dev = port->ibdev;
+ stats = port->hw_stats;
+ }
+ ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
+ if (ret)
+ return ret;
+ return print_hw_stat(stats, hsa->index, buf);
+}
+
+static ssize_t show_stats_lifespan(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct hw_stats_attribute *hsa;
+ int msecs;
+
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ struct ib_device *dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ msecs = jiffies_to_msecs(dev->hw_stats->lifespan);
+ } else {
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ msecs = jiffies_to_msecs(p->hw_stats->lifespan);
+ }
+ return sprintf(buf, "%d\n", msecs);
+}
+
+static ssize_t set_stats_lifespan(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hw_stats_attribute *hsa;
+ int msecs;
+ int jiffies;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &msecs);
+ if (ret)
+ return ret;
+ if (msecs < 0 || msecs > 10000)
+ return -EINVAL;
+ jiffies = msecs_to_jiffies(msecs);
+ hsa = container_of(attr, struct hw_stats_attribute, attr);
+ if (!hsa->port_num) {
+ struct ib_device *dev = container_of((struct device *)kobj,
+ struct ib_device, dev);
+ dev->hw_stats->lifespan = jiffies;
+ } else {
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ p->hw_stats->lifespan = jiffies;
+ }
+ return count;
+}
+
+static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group)
+{
+ struct attribute **attr;
+
+ sysfs_remove_group(kobj, attr_group);
+
+ for (attr = attr_group->attrs; *attr; attr++)
+ kfree(*attr);
+ kfree(attr_group);
+}
+
+static struct attribute *alloc_hsa(int index, u8 port_num, const char *name)
+{
+ struct hw_stats_attribute *hsa;
+
+ hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+ if (!hsa)
+ return NULL;
+
+ hsa->attr.name = (char *)name;
+ hsa->attr.mode = S_IRUGO;
+ hsa->show = show_hw_stats;
+ hsa->store = NULL;
+ hsa->index = index;
+ hsa->port_num = port_num;
+
+ return &hsa->attr;
+}
+
+static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num)
+{
+ struct hw_stats_attribute *hsa;
+
+ hsa = kmalloc(sizeof(*hsa), GFP_KERNEL);
+ if (!hsa)
+ return NULL;
+
+ hsa->attr.name = name;
+ hsa->attr.mode = S_IWUSR | S_IRUGO;
+ hsa->show = show_stats_lifespan;
+ hsa->store = set_stats_lifespan;
+ hsa->index = 0;
+ hsa->port_num = port_num;
+
+ return &hsa->attr;
+}
+
+static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
+ u8 port_num)
+{
+ struct attribute_group *hsag;
+ struct rdma_hw_stats *stats;
+ int i, ret;
+
+ stats = device->alloc_hw_stats(device, port_num);
+
+ if (!stats)
+ return;
+
+ if (!stats->names || stats->num_counters <= 0)
+ goto err_free_stats;
+
+ /*
+ * Two extra attribue elements here, one for the lifespan entry and
+ * one to NULL terminate the list for the sysfs core code
+ */
+ hsag = kzalloc(sizeof(*hsag) +
+ sizeof(void *) * (stats->num_counters + 2),
+ GFP_KERNEL);
+ if (!hsag)
+ goto err_free_stats;
+
+ ret = device->get_hw_stats(device, stats, port_num,
+ stats->num_counters);
+ if (ret != stats->num_counters)
+ goto err_free_hsag;
+
+ stats->timestamp = jiffies;
+
+ hsag->name = "hw_counters";
+ hsag->attrs = (void *)hsag + sizeof(*hsag);
+
+ for (i = 0; i < stats->num_counters; i++) {
+ hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]);
+ if (!hsag->attrs[i])
+ goto err;
+ sysfs_attr_init(hsag->attrs[i]);
+ }
+
+ /* treat an error here as non-fatal */
+ hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num);
+ if (hsag->attrs[i])
+ sysfs_attr_init(hsag->attrs[i]);
+
+ if (port) {
+ struct kobject *kobj = &port->kobj;
+ ret = sysfs_create_group(kobj, hsag);
+ if (ret)
+ goto err;
+ port->hw_stats_ag = hsag;
+ port->hw_stats = stats;
+ } else {
+ struct kobject *kobj = &device->dev.kobj;
+ ret = sysfs_create_group(kobj, hsag);
+ if (ret)
+ goto err;
+ device->hw_stats_ag = hsag;
+ device->hw_stats = stats;
+ }
+
+ return;
+
+err:
+ for (; i >= 0; i--)
+ kfree(hsag->attrs[i]);
+err_free_hsag:
+ kfree(hsag);
+err_free_stats:
+ kfree(stats);
+ return;
+}
+
static int add_port(struct ib_device *device, int port_num,
int (*port_callback)(struct ib_device *,
u8, struct kobject *))
@@ -835,6 +1068,14 @@ static int add_port(struct ib_device *device, int port_num,
goto err_remove_pkey;
}
+ /*
+ * If port == 0, it means we have only one port and the parent
+ * device, not this port device, should be the holder of the
+ * hw_counters
+ */
+ if (device->alloc_hw_stats && port_num)
+ setup_hw_stats(device, p, port_num);
+
list_add_tail(&p->kobj.entry, &device->port_list);
kobject_uevent(&p->kobj, KOBJ_ADD);
@@ -960,130 +1201,28 @@ static ssize_t set_node_desc(struct device *device,
return count;
}
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ ib_get_device_fw_str(dev, buf, PAGE_SIZE);
+ strlcat(buf, "\n", PAGE_SIZE);
+ return strlen(buf);
+}
+
static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static struct device_attribute *ib_class_attributes[] = {
&dev_attr_node_type,
&dev_attr_sys_image_guid,
&dev_attr_node_guid,
- &dev_attr_node_desc
-};
-
-/* Show a given an attribute in the statistics group */
-static ssize_t show_protocol_stat(const struct device *device,
- struct device_attribute *attr, char *buf,
- unsigned offset)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
- union rdma_protocol_stats stats;
- ssize_t ret;
-
- ret = dev->get_protocol_stats(dev, &stats);
- if (ret)
- return ret;
-
- return sprintf(buf, "%llu\n",
- (unsigned long long) ((u64 *) &stats)[offset]);
-}
-
-/* generate a read-only iwarp statistics attribute */
-#define IW_STATS_ENTRY(name) \
-static ssize_t show_##name(struct device *device, \
- struct device_attribute *attr, char *buf) \
-{ \
- return show_protocol_stat(device, attr, buf, \
- offsetof(struct iw_protocol_stats, name) / \
- sizeof (u64)); \
-} \
-static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
-
-IW_STATS_ENTRY(ipInReceives);
-IW_STATS_ENTRY(ipInHdrErrors);
-IW_STATS_ENTRY(ipInTooBigErrors);
-IW_STATS_ENTRY(ipInNoRoutes);
-IW_STATS_ENTRY(ipInAddrErrors);
-IW_STATS_ENTRY(ipInUnknownProtos);
-IW_STATS_ENTRY(ipInTruncatedPkts);
-IW_STATS_ENTRY(ipInDiscards);
-IW_STATS_ENTRY(ipInDelivers);
-IW_STATS_ENTRY(ipOutForwDatagrams);
-IW_STATS_ENTRY(ipOutRequests);
-IW_STATS_ENTRY(ipOutDiscards);
-IW_STATS_ENTRY(ipOutNoRoutes);
-IW_STATS_ENTRY(ipReasmTimeout);
-IW_STATS_ENTRY(ipReasmReqds);
-IW_STATS_ENTRY(ipReasmOKs);
-IW_STATS_ENTRY(ipReasmFails);
-IW_STATS_ENTRY(ipFragOKs);
-IW_STATS_ENTRY(ipFragFails);
-IW_STATS_ENTRY(ipFragCreates);
-IW_STATS_ENTRY(ipInMcastPkts);
-IW_STATS_ENTRY(ipOutMcastPkts);
-IW_STATS_ENTRY(ipInBcastPkts);
-IW_STATS_ENTRY(ipOutBcastPkts);
-IW_STATS_ENTRY(tcpRtoAlgorithm);
-IW_STATS_ENTRY(tcpRtoMin);
-IW_STATS_ENTRY(tcpRtoMax);
-IW_STATS_ENTRY(tcpMaxConn);
-IW_STATS_ENTRY(tcpActiveOpens);
-IW_STATS_ENTRY(tcpPassiveOpens);
-IW_STATS_ENTRY(tcpAttemptFails);
-IW_STATS_ENTRY(tcpEstabResets);
-IW_STATS_ENTRY(tcpCurrEstab);
-IW_STATS_ENTRY(tcpInSegs);
-IW_STATS_ENTRY(tcpOutSegs);
-IW_STATS_ENTRY(tcpRetransSegs);
-IW_STATS_ENTRY(tcpInErrs);
-IW_STATS_ENTRY(tcpOutRsts);
-
-static struct attribute *iw_proto_stats_attrs[] = {
- &dev_attr_ipInReceives.attr,
- &dev_attr_ipInHdrErrors.attr,
- &dev_attr_ipInTooBigErrors.attr,
- &dev_attr_ipInNoRoutes.attr,
- &dev_attr_ipInAddrErrors.attr,
- &dev_attr_ipInUnknownProtos.attr,
- &dev_attr_ipInTruncatedPkts.attr,
- &dev_attr_ipInDiscards.attr,
- &dev_attr_ipInDelivers.attr,
- &dev_attr_ipOutForwDatagrams.attr,
- &dev_attr_ipOutRequests.attr,
- &dev_attr_ipOutDiscards.attr,
- &dev_attr_ipOutNoRoutes.attr,
- &dev_attr_ipReasmTimeout.attr,
- &dev_attr_ipReasmReqds.attr,
- &dev_attr_ipReasmOKs.attr,
- &dev_attr_ipReasmFails.attr,
- &dev_attr_ipFragOKs.attr,
- &dev_attr_ipFragFails.attr,
- &dev_attr_ipFragCreates.attr,
- &dev_attr_ipInMcastPkts.attr,
- &dev_attr_ipOutMcastPkts.attr,
- &dev_attr_ipInBcastPkts.attr,
- &dev_attr_ipOutBcastPkts.attr,
- &dev_attr_tcpRtoAlgorithm.attr,
- &dev_attr_tcpRtoMin.attr,
- &dev_attr_tcpRtoMax.attr,
- &dev_attr_tcpMaxConn.attr,
- &dev_attr_tcpActiveOpens.attr,
- &dev_attr_tcpPassiveOpens.attr,
- &dev_attr_tcpAttemptFails.attr,
- &dev_attr_tcpEstabResets.attr,
- &dev_attr_tcpCurrEstab.attr,
- &dev_attr_tcpInSegs.attr,
- &dev_attr_tcpOutSegs.attr,
- &dev_attr_tcpRetransSegs.attr,
- &dev_attr_tcpInErrs.attr,
- &dev_attr_tcpOutRsts.attr,
- NULL
-};
-
-static struct attribute_group iw_stats_group = {
- .name = "proto_stats",
- .attrs = iw_proto_stats_attrs,
+ &dev_attr_node_desc,
+ &dev_attr_fw_ver,
};
static void free_port_list_attributes(struct ib_device *device)
@@ -1093,6 +1232,10 @@ static void free_port_list_attributes(struct ib_device *device)
list_for_each_entry_safe(p, t, &device->port_list, entry) {
struct ib_port *port = container_of(p, struct ib_port, kobj);
list_del(&p->entry);
+ if (port->hw_stats) {
+ kfree(port->hw_stats);
+ free_hsag(&port->kobj, port->hw_stats_ag);
+ }
sysfs_remove_group(p, port->pma_table);
sysfs_remove_group(p, &port->pkey_group);
sysfs_remove_group(p, &port->gid_group);
@@ -1149,11 +1292,8 @@ int ib_device_register_sysfs(struct ib_device *device,
}
}
- if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
- ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
- if (ret)
- goto err_put;
- }
+ if (device->alloc_hw_stats)
+ setup_hw_stats(device, NULL, 0);
return 0;
@@ -1169,15 +1309,18 @@ err:
void ib_device_unregister_sysfs(struct ib_device *device)
{
- /* Hold kobject until ib_dealloc_device() */
- struct kobject *kobj_dev = kobject_get(&device->dev.kobj);
int i;
- if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats)
- sysfs_remove_group(kobj_dev, &iw_stats_group);
+ /* Hold kobject until ib_dealloc_device() */
+ kobject_get(&device->dev.kobj);
free_port_list_attributes(device);
+ if (device->hw_stats) {
+ kfree(device->hw_stats);
+ free_hsag(&device->dev.kobj, device->hw_stats_ag);
+ }
+
for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
device_remove_file(&device->dev, ib_class_attributes[i]);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index c0f3826abb30..2825ece91d3c 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -106,6 +106,7 @@ struct ucma_multicast {
int events_reported;
u64 uid;
+ u8 join_state;
struct list_head list;
struct sockaddr_storage addr;
};
@@ -1317,12 +1318,20 @@ static ssize_t ucma_process_join(struct ucma_file *file,
struct ucma_multicast *mc;
struct sockaddr *addr;
int ret;
+ u8 join_state;
if (out_len < sizeof(resp))
return -ENOSPC;
addr = (struct sockaddr *) &cmd->addr;
- if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+ if (!cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+ return -EINVAL;
+
+ if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
+ join_state = BIT(FULLMEMBER_JOIN);
+ else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
+ join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
+ else
return -EINVAL;
ctx = ucma_get_ctx(file, cmd->id);
@@ -1335,10 +1344,11 @@ static ssize_t ucma_process_join(struct ucma_file *file,
ret = -ENOMEM;
goto err1;
}
-
+ mc->join_state = join_state;
mc->uid = cmd->uid;
memcpy(&mc->addr, addr, cmd->addr_size);
- ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
+ join_state, mc);
if (ret)
goto err2;
@@ -1382,7 +1392,7 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
join_cmd.uid = cmd.uid;
join_cmd.id = cmd.id;
join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
- join_cmd.reserved = 0;
+ join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
return ucma_process_join(file, &join_cmd, out_len);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index fe4d2e1a8b58..c68746ce6624 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -37,7 +37,6 @@
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/hugetlb.h>
-#include <linux/dma-attrs.h>
#include <linux/slab.h>
#include <rdma/ib_umem_odp.h>
@@ -92,12 +91,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
unsigned long npages;
int ret;
int i;
- DEFINE_DMA_ATTRS(attrs);
+ unsigned long dma_attrs = 0;
struct scatterlist *sg, *sg_list_start;
int need_release = 0;
if (dmasync)
- dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+ dma_attrs |= DMA_ATTR_WRITE_BARRIER;
if (!size)
return ERR_PTR(-EINVAL);
@@ -215,7 +214,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem->sg_head.sgl,
umem->npages,
DMA_BIDIRECTIONAL,
- &attrs);
+ dma_attrs);
if (umem->nmap <= 0) {
ret = -ENOMEM;
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 612ccfd39bf9..df26a741cda6 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -116,6 +116,7 @@ struct ib_uverbs_event_file {
struct ib_uverbs_file {
struct kref ref;
struct mutex mutex;
+ struct mutex cleanup_mutex; /* protect cleanup */
struct ib_uverbs_device *device;
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
@@ -162,6 +163,10 @@ struct ib_uqp_object {
struct ib_uxrcd_object *uxrcd;
};
+struct ib_uwq_object {
+ struct ib_uevent_object uevent;
+};
+
struct ib_ucq_object {
struct ib_uobject uobject;
struct ib_uverbs_file *uverbs_file;
@@ -181,6 +186,8 @@ extern struct idr ib_uverbs_qp_idr;
extern struct idr ib_uverbs_srq_idr;
extern struct idr ib_uverbs_xrcd_idr;
extern struct idr ib_uverbs_rule_idr;
+extern struct idr ib_uverbs_wq_idr;
+extern struct idr ib_uverbs_rwq_ind_tbl_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
@@ -199,6 +206,7 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
void ib_uverbs_event_handler(struct ib_event_handler *handler,
struct ib_event *event);
@@ -219,6 +227,7 @@ struct ib_uverbs_flow_spec {
struct ib_uverbs_flow_spec_eth eth;
struct ib_uverbs_flow_spec_ipv4 ipv4;
struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ struct ib_uverbs_flow_spec_ipv6 ipv6;
};
};
@@ -275,5 +284,10 @@ IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
IB_UVERBS_DECLARE_EX_CMD(query_device);
IB_UVERBS_DECLARE_EX_CMD(create_cq);
IB_UVERBS_DECLARE_EX_CMD(create_qp);
+IB_UVERBS_DECLARE_EX_CMD(create_wq);
+IB_UVERBS_DECLARE_EX_CMD(modify_wq);
+IB_UVERBS_DECLARE_EX_CMD(destroy_wq);
+IB_UVERBS_DECLARE_EX_CMD(create_rwq_ind_table);
+IB_UVERBS_DECLARE_EX_CMD(destroy_rwq_ind_table);
#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 6fdc7ecdaca0..f6647318138d 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -57,6 +57,8 @@ static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" };
+static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" };
/*
* The ib_uobject locking scheme is as follows:
@@ -243,6 +245,27 @@ static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
}
+static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0);
+}
+
+static void put_wq_read(struct ib_wq *wq)
+{
+ put_uobj_read(wq->uobject);
+}
+
+static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle,
+ struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0);
+}
+
+static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table)
+{
+ put_uobj_read(ind_table->uobject);
+}
+
static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
{
struct ib_uobject *uobj;
@@ -326,6 +349,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->qp_list);
INIT_LIST_HEAD(&ucontext->srq_list);
INIT_LIST_HEAD(&ucontext->ah_list);
+ INIT_LIST_HEAD(&ucontext->wq_list);
+ INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list);
INIT_LIST_HEAD(&ucontext->xrcd_list);
INIT_LIST_HEAD(&ucontext->rule_list);
rcu_read_lock();
@@ -1747,9 +1772,11 @@ static int create_qp(struct ib_uverbs_file *file,
struct ib_srq *srq = NULL;
struct ib_qp *qp;
char *buf;
- struct ib_qp_init_attr attr;
+ struct ib_qp_init_attr attr = {};
struct ib_uverbs_ex_create_qp_resp resp;
int ret;
+ struct ib_rwq_ind_table *ind_tbl = NULL;
+ bool has_sq = true;
if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
return -EPERM;
@@ -1761,6 +1788,32 @@ static int create_qp(struct ib_uverbs_file *file,
init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext,
&qp_lock_class);
down_write(&obj->uevent.uobject.mutex);
+ if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) +
+ sizeof(cmd->rwq_ind_tbl_handle) &&
+ (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) {
+ ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle,
+ file->ucontext);
+ if (!ind_tbl) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.rwq_ind_tbl = ind_tbl;
+ }
+
+ if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) +
+ sizeof(cmd->reserved1)) && cmd->reserved1) {
+ ret = -EOPNOTSUPP;
+ goto err_put;
+ }
+
+ if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (ind_tbl && !cmd->max_send_wr)
+ has_sq = false;
if (cmd->qp_type == IB_QPT_XRC_TGT) {
xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext,
@@ -1784,20 +1837,24 @@ static int create_qp(struct ib_uverbs_file *file,
}
}
- if (cmd->recv_cq_handle != cmd->send_cq_handle) {
- rcq = idr_read_cq(cmd->recv_cq_handle,
- file->ucontext, 0);
- if (!rcq) {
- ret = -EINVAL;
- goto err_put;
+ if (!ind_tbl) {
+ if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+ rcq = idr_read_cq(cmd->recv_cq_handle,
+ file->ucontext, 0);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
}
}
}
- scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
- rcq = rcq ?: scq;
+ if (has_sq)
+ scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq);
+ if (!ind_tbl)
+ rcq = rcq ?: scq;
pd = idr_read_pd(cmd->pd_handle, file->ucontext);
- if (!pd || !scq) {
+ if (!pd || (!scq && has_sq)) {
ret = -EINVAL;
goto err_put;
}
@@ -1833,7 +1890,8 @@ static int create_qp(struct ib_uverbs_file *file,
if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
IB_QP_CREATE_CROSS_CHANNEL |
IB_QP_CREATE_MANAGED_SEND |
- IB_QP_CREATE_MANAGED_RECV)) {
+ IB_QP_CREATE_MANAGED_RECV |
+ IB_QP_CREATE_SCATTER_FCS)) {
ret = -EINVAL;
goto err_put;
}
@@ -1863,16 +1921,20 @@ static int create_qp(struct ib_uverbs_file *file,
qp->send_cq = attr.send_cq;
qp->recv_cq = attr.recv_cq;
qp->srq = attr.srq;
+ qp->rwq_ind_tbl = ind_tbl;
qp->event_handler = attr.event_handler;
qp->qp_context = attr.qp_context;
qp->qp_type = attr.qp_type;
atomic_set(&qp->usecnt, 0);
atomic_inc(&pd->usecnt);
- atomic_inc(&attr.send_cq->usecnt);
+ if (attr.send_cq)
+ atomic_inc(&attr.send_cq->usecnt);
if (attr.recv_cq)
atomic_inc(&attr.recv_cq->usecnt);
if (attr.srq)
atomic_inc(&attr.srq->usecnt);
+ if (ind_tbl)
+ atomic_inc(&ind_tbl->usecnt);
}
qp->uobject = &obj->uevent.uobject;
@@ -1912,6 +1974,8 @@ static int create_qp(struct ib_uverbs_file *file,
put_cq_read(rcq);
if (srq)
put_srq_read(srq);
+ if (ind_tbl)
+ put_rwq_indirection_table_read(ind_tbl);
mutex_lock(&file->mutex);
list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
@@ -1939,6 +2003,8 @@ err_put:
put_cq_read(rcq);
if (srq)
put_srq_read(srq);
+ if (ind_tbl)
+ put_rwq_indirection_table_read(ind_tbl);
put_uobj_write(&obj->uevent.uobject);
return ret;
@@ -2032,7 +2098,7 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file,
if (err)
return err;
- if (cmd.comp_mask)
+ if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
return -EINVAL;
if (cmd.reserved)
@@ -3039,6 +3105,15 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
sizeof(struct ib_flow_ipv4_filter));
break;
+ case IB_FLOW_SPEC_IPV6:
+ ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6);
+ if (ib_spec->ipv6.size != kern_spec->ipv6.size)
+ return -EINVAL;
+ memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val,
+ sizeof(struct ib_flow_ipv6_filter));
+ memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask,
+ sizeof(struct ib_flow_ipv6_filter));
+ break;
case IB_FLOW_SPEC_TCP:
case IB_FLOW_SPEC_UDP:
ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
@@ -3055,6 +3130,445 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
return 0;
}
+int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_wq cmd = {};
+ struct ib_uverbs_ex_create_wq_resp resp = {};
+ struct ib_uwq_object *obj;
+ int err = 0;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_wq *wq;
+ struct ib_wq_init_attr wq_init_attr = {};
+ size_t required_cmd_sz;
+ size_t required_resp_len;
+
+ required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge);
+ required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ err = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext,
+ &wq_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq) {
+ err = -EINVAL;
+ goto err_put_pd;
+ }
+
+ wq_init_attr.cq = cq;
+ wq_init_attr.max_sge = cmd.max_sge;
+ wq_init_attr.max_wr = cmd.max_wr;
+ wq_init_attr.wq_context = file;
+ wq_init_attr.wq_type = cmd.wq_type;
+ wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ wq = pd->device->create_wq(pd, &wq_init_attr, uhw);
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
+ goto err_put_cq;
+ }
+
+ wq->uobject = &obj->uevent.uobject;
+ obj->uevent.uobject.object = wq;
+ wq->wq_type = wq_init_attr.wq_type;
+ wq->cq = cq;
+ wq->pd = pd;
+ wq->device = pd->device;
+ wq->wq_context = wq_init_attr.wq_context;
+ atomic_set(&wq->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&cq->usecnt);
+ wq->uobject = &obj->uevent.uobject;
+ obj->uevent.uobject.object = wq;
+ err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+ if (err)
+ goto destroy_wq;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.wq_handle = obj->uevent.uobject.id;
+ resp.max_sge = wq_init_attr.max_sge;
+ resp.max_wr = wq_init_attr.max_wr;
+ resp.wqn = wq->wq_num;
+ resp.response_length = required_resp_len;
+ err = ib_copy_to_udata(ucore,
+ &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+
+ put_pd_read(pd);
+ put_cq_read(cq);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+ up_write(&obj->uevent.uobject.mutex);
+ return 0;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject);
+destroy_wq:
+ ib_destroy_wq(wq);
+err_put_cq:
+ put_cq_read(cq);
+err_put_pd:
+ put_pd_read(pd);
+err_uobj:
+ put_uobj_write(&obj->uevent.uobject);
+
+ return err;
+}
+
+int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_destroy_wq cmd = {};
+ struct ib_uverbs_ex_destroy_wq_resp resp = {};
+ struct ib_wq *wq;
+ struct ib_uobject *uobj;
+ struct ib_uwq_object *obj;
+ size_t required_cmd_sz;
+ size_t required_resp_len;
+ int ret;
+
+ required_cmd_sz = offsetof(typeof(cmd), wq_handle) + sizeof(cmd.wq_handle);
+ required_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ resp.response_length = required_resp_len;
+ uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle,
+ file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ wq = uobj->object;
+ obj = container_of(uobj, struct ib_uwq_object, uevent.uobject);
+ ret = ib_destroy_wq(wq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, &obj->uevent);
+ resp.events_reported = obj->uevent.events_reported;
+ put_uobj(uobj);
+
+ ret = ib_copy_to_udata(ucore, &resp, resp.response_length);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_modify_wq cmd = {};
+ struct ib_wq *wq;
+ struct ib_wq_attr wq_attr = {};
+ size_t required_cmd_sz;
+ int ret;
+
+ required_cmd_sz = offsetof(typeof(cmd), curr_wq_state) + sizeof(cmd.curr_wq_state);
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (!cmd.attr_mask)
+ return -EINVAL;
+
+ if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE))
+ return -EINVAL;
+
+ wq = idr_read_wq(cmd.wq_handle, file->ucontext);
+ if (!wq)
+ return -EINVAL;
+
+ wq_attr.curr_wq_state = cmd.curr_wq_state;
+ wq_attr.wq_state = cmd.wq_state;
+ ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw);
+ put_wq_read(wq);
+ return ret;
+}
+
+int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_create_rwq_ind_table cmd = {};
+ struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {};
+ struct ib_uobject *uobj;
+ int err = 0;
+ struct ib_rwq_ind_table_init_attr init_attr = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_wq **wqs = NULL;
+ u32 *wqs_handles = NULL;
+ struct ib_wq *wq = NULL;
+ int i, j, num_read_wqs;
+ u32 num_wq_handles;
+ u32 expected_in_size;
+ size_t required_cmd_sz_header;
+ size_t required_resp_len;
+
+ required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size);
+ required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num);
+
+ if (ucore->inlen < required_cmd_sz_header)
+ return -EINVAL;
+
+ if (ucore->outlen < required_resp_len)
+ return -ENOSPC;
+
+ err = ib_copy_from_udata(&cmd, ucore, required_cmd_sz_header);
+ if (err)
+ return err;
+
+ ucore->inbuf += required_cmd_sz_header;
+ ucore->inlen -= required_cmd_sz_header;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE)
+ return -EINVAL;
+
+ num_wq_handles = 1 << cmd.log_ind_tbl_size;
+ expected_in_size = num_wq_handles * sizeof(__u32);
+ if (num_wq_handles == 1)
+ /* input size for wq handles is u64 aligned */
+ expected_in_size += sizeof(__u32);
+
+ if (ucore->inlen < expected_in_size)
+ return -EINVAL;
+
+ if (ucore->inlen > expected_in_size &&
+ !ib_is_udata_cleared(ucore, expected_in_size,
+ ucore->inlen - expected_in_size))
+ return -EOPNOTSUPP;
+
+ wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles),
+ GFP_KERNEL);
+ if (!wqs_handles)
+ return -ENOMEM;
+
+ err = ib_copy_from_udata(wqs_handles, ucore,
+ num_wq_handles * sizeof(__u32));
+ if (err)
+ goto err_free;
+
+ wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL);
+ if (!wqs) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ for (num_read_wqs = 0; num_read_wqs < num_wq_handles;
+ num_read_wqs++) {
+ wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext);
+ if (!wq) {
+ err = -EINVAL;
+ goto put_wqs;
+ }
+
+ wqs[num_read_wqs] = wq;
+ }
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj) {
+ err = -ENOMEM;
+ goto put_wqs;
+ }
+
+ init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class);
+ down_write(&uobj->mutex);
+ init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
+ init_attr.ind_tbl = wqs;
+ rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw);
+
+ if (IS_ERR(rwq_ind_tbl)) {
+ err = PTR_ERR(rwq_ind_tbl);
+ goto err_uobj;
+ }
+
+ rwq_ind_tbl->ind_tbl = wqs;
+ rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size;
+ rwq_ind_tbl->uobject = uobj;
+ uobj->object = rwq_ind_tbl;
+ rwq_ind_tbl->device = ib_dev;
+ atomic_set(&rwq_ind_tbl->usecnt, 0);
+
+ for (i = 0; i < num_wq_handles; i++)
+ atomic_inc(&wqs[i]->usecnt);
+
+ err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+ if (err)
+ goto destroy_ind_tbl;
+
+ resp.ind_tbl_handle = uobj->id;
+ resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
+ resp.response_length = required_resp_len;
+
+ err = ib_copy_to_udata(ucore,
+ &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+
+ kfree(wqs_handles);
+
+ for (j = 0; j < num_read_wqs; j++)
+ put_wq_read(wqs[j]);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+ return 0;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+destroy_ind_tbl:
+ ib_destroy_rwq_ind_table(rwq_ind_tbl);
+err_uobj:
+ put_uobj_write(uobj);
+put_wqs:
+ for (j = 0; j < num_read_wqs; j++)
+ put_wq_read(wqs[j]);
+err_free:
+ kfree(wqs_handles);
+ kfree(wqs);
+ return err;
+}
+
+int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_uobject *uobj;
+ int ret;
+ struct ib_wq **ind_tbl;
+ size_t required_cmd_sz;
+
+ required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle);
+
+ if (ucore->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (ucore->inlen > sizeof(cmd) &&
+ !ib_is_udata_cleared(ucore, sizeof(cmd),
+ ucore->inlen - sizeof(cmd)))
+ return -EOPNOTSUPP;
+
+ ret = ib_copy_from_udata(&cmd, ucore, min(sizeof(cmd), ucore->inlen));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle,
+ file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ rwq_ind_tbl = uobj->object;
+ ind_tbl = rwq_ind_tbl->ind_tbl;
+
+ ret = ib_destroy_rwq_ind_table(rwq_ind_tbl);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+ kfree(ind_tbl);
+ return ret;
+}
+
int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
struct ib_device *ib_dev,
struct ib_udata *ucore,
@@ -3088,8 +3602,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
if (cmd.comp_mask)
return -EINVAL;
- if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
- !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+ if (!capable(CAP_NET_RAW))
return -EPERM;
if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
@@ -3655,6 +4168,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
resp.hca_core_clock = attr.hca_core_clock;
resp.response_length += sizeof(resp.hca_core_clock);
+ if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex))
+ goto end;
+
+ resp.device_cap_flags_ex = attr.device_cap_flags;
+ resp.response_length += sizeof(resp.device_cap_flags_ex);
end:
err = ib_copy_to_udata(ucore, &resp, resp.response_length);
return err;
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 31f422a70623..0012fa58c105 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -76,6 +76,8 @@ DEFINE_IDR(ib_uverbs_qp_idr);
DEFINE_IDR(ib_uverbs_srq_idr);
DEFINE_IDR(ib_uverbs_xrcd_idr);
DEFINE_IDR(ib_uverbs_rule_idr);
+DEFINE_IDR(ib_uverbs_wq_idr);
+DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr);
static DEFINE_SPINLOCK(map_lock);
static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -130,6 +132,11 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
[IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
[IB_USER_VERBS_EX_CMD_CREATE_CQ] = ib_uverbs_ex_create_cq,
[IB_USER_VERBS_EX_CMD_CREATE_QP] = ib_uverbs_ex_create_qp,
+ [IB_USER_VERBS_EX_CMD_CREATE_WQ] = ib_uverbs_ex_create_wq,
+ [IB_USER_VERBS_EX_CMD_MODIFY_WQ] = ib_uverbs_ex_modify_wq,
+ [IB_USER_VERBS_EX_CMD_DESTROY_WQ] = ib_uverbs_ex_destroy_wq,
+ [IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_ex_create_rwq_ind_table,
+ [IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_ex_destroy_rwq_ind_table,
};
static void ib_uverbs_add_one(struct ib_device *device);
@@ -265,6 +272,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
kfree(uqp);
}
+ list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) {
+ struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object;
+ struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
+
+ idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj);
+ ib_destroy_rwq_ind_table(rwq_ind_tbl);
+ kfree(ind_tbl);
+ kfree(uobj);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) {
+ struct ib_wq *wq = uobj->object;
+ struct ib_uwq_object *uwq =
+ container_of(uobj, struct ib_uwq_object, uevent.uobject);
+
+ idr_remove_uobj(&ib_uverbs_wq_idr, uobj);
+ ib_destroy_wq(wq);
+ ib_uverbs_release_uevent(file, &uwq->uevent);
+ kfree(uwq);
+ }
+
list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
struct ib_srq *srq = uobj->object;
struct ib_uevent_object *uevent =
@@ -568,6 +596,16 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
&uobj->events_reported);
}
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj = container_of(event->element.wq->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
{
struct ib_uevent_object *uobj;
@@ -931,6 +969,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
file->async_file = NULL;
kref_init(&file->ref);
mutex_init(&file->mutex);
+ mutex_init(&file->cleanup_mutex);
filp->private_data = file;
kobject_get(&dev->kobj);
@@ -956,18 +995,20 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_file *file = filp->private_data;
struct ib_uverbs_device *dev = file->device;
- struct ib_ucontext *ucontext = NULL;
+
+ mutex_lock(&file->cleanup_mutex);
+ if (file->ucontext) {
+ ib_uverbs_cleanup_ucontext(file, file->ucontext);
+ file->ucontext = NULL;
+ }
+ mutex_unlock(&file->cleanup_mutex);
mutex_lock(&file->device->lists_mutex);
- ucontext = file->ucontext;
- file->ucontext = NULL;
if (!file->is_closed) {
list_del(&file->list);
file->is_closed = 1;
}
mutex_unlock(&file->device->lists_mutex);
- if (ucontext)
- ib_uverbs_cleanup_ucontext(file, ucontext);
if (file->async_file)
kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
@@ -1181,22 +1222,30 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
mutex_lock(&uverbs_dev->lists_mutex);
while (!list_empty(&uverbs_dev->uverbs_file_list)) {
struct ib_ucontext *ucontext;
-
file = list_first_entry(&uverbs_dev->uverbs_file_list,
struct ib_uverbs_file, list);
file->is_closed = 1;
- ucontext = file->ucontext;
list_del(&file->list);
- file->ucontext = NULL;
kref_get(&file->ref);
mutex_unlock(&uverbs_dev->lists_mutex);
- /* We must release the mutex before going ahead and calling
- * disassociate_ucontext. disassociate_ucontext might end up
- * indirectly calling uverbs_close, for example due to freeing
- * the resources (e.g mmput).
- */
+
ib_uverbs_event_handler(&file->event_handler, &event);
+
+ mutex_lock(&file->cleanup_mutex);
+ ucontext = file->ucontext;
+ file->ucontext = NULL;
+ mutex_unlock(&file->cleanup_mutex);
+
+ /* At this point ib_uverbs_close cannot be running
+ * ib_uverbs_cleanup_ucontext
+ */
if (ucontext) {
+ /* We must release the mutex before going ahead and
+ * calling disassociate_ucontext. disassociate_ucontext
+ * might end up indirectly calling uverbs_close,
+ * for example due to freeing the resources
+ * (e.g mmput).
+ */
ib_dev->disassociate_ucontext(ucontext);
ib_uverbs_cleanup_ucontext(file, ucontext);
}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index b65b3541e732..f2b776efab3a 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -48,6 +48,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>
+#include <rdma/rw.h>
#include "core_priv.h"
@@ -510,12 +511,16 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
ah_attr->grh.dgid = sgid;
if (!rdma_cap_eth_ah(device, port_num)) {
- ret = ib_find_cached_gid_by_port(device, &dgid,
- IB_GID_TYPE_IB,
- port_num, NULL,
- &gid_index);
- if (ret)
- return ret;
+ if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
+ ret = ib_find_cached_gid_by_port(device, &dgid,
+ IB_GID_TYPE_IB,
+ port_num, NULL,
+ &gid_index);
+ if (ret)
+ return ret;
+ } else {
+ gid_index = 0;
+ }
}
ah_attr->grh.sgid_index = (u8) gid_index;
@@ -723,62 +728,112 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
}
EXPORT_SYMBOL(ib_open_qp);
+static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *real_qp = qp;
+
+ qp->event_handler = __ib_shared_qp_event_handler;
+ qp->qp_context = qp;
+ qp->pd = NULL;
+ qp->send_cq = qp->recv_cq = NULL;
+ qp->srq = NULL;
+ qp->xrcd = qp_init_attr->xrcd;
+ atomic_inc(&qp_init_attr->xrcd->usecnt);
+ INIT_LIST_HEAD(&qp->open_list);
+
+ qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+ qp_init_attr->qp_context);
+ if (!IS_ERR(qp))
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+ else
+ real_qp->device->destroy_qp(real_qp);
+ return qp;
+}
+
struct ib_qp *ib_create_qp(struct ib_pd *pd,
struct ib_qp_init_attr *qp_init_attr)
{
- struct ib_qp *qp, *real_qp;
- struct ib_device *device;
+ struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
+ struct ib_qp *qp;
+ int ret;
+
+ if (qp_init_attr->rwq_ind_tbl &&
+ (qp_init_attr->recv_cq ||
+ qp_init_attr->srq || qp_init_attr->cap.max_recv_wr ||
+ qp_init_attr->cap.max_recv_sge))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If the callers is using the RDMA API calculate the resources
+ * needed for the RDMA READ/WRITE operations.
+ *
+ * Note that these callers need to pass in a port number.
+ */
+ if (qp_init_attr->cap.max_rdma_ctxs)
+ rdma_rw_init_qp(device, qp_init_attr);
- device = pd ? pd->device : qp_init_attr->xrcd->device;
qp = device->create_qp(pd, qp_init_attr, NULL);
+ if (IS_ERR(qp))
+ return qp;
+
+ qp->device = device;
+ qp->real_qp = qp;
+ qp->uobject = NULL;
+ qp->qp_type = qp_init_attr->qp_type;
+ qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
+
+ atomic_set(&qp->usecnt, 0);
+ qp->mrs_used = 0;
+ spin_lock_init(&qp->mr_lock);
+ INIT_LIST_HEAD(&qp->rdma_mrs);
+ INIT_LIST_HEAD(&qp->sig_mrs);
+
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
+ return ib_create_xrc_qp(qp, qp_init_attr);
+
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+ qp->recv_cq = NULL;
+ qp->srq = NULL;
+ } else {
+ qp->recv_cq = qp_init_attr->recv_cq;
+ if (qp_init_attr->recv_cq)
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ qp->srq = qp_init_attr->srq;
+ if (qp->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ }
- if (!IS_ERR(qp)) {
- qp->device = device;
- qp->real_qp = qp;
- qp->uobject = NULL;
- qp->qp_type = qp_init_attr->qp_type;
-
- atomic_set(&qp->usecnt, 0);
- if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
- qp->event_handler = __ib_shared_qp_event_handler;
- qp->qp_context = qp;
- qp->pd = NULL;
- qp->send_cq = qp->recv_cq = NULL;
- qp->srq = NULL;
- qp->xrcd = qp_init_attr->xrcd;
- atomic_inc(&qp_init_attr->xrcd->usecnt);
- INIT_LIST_HEAD(&qp->open_list);
-
- real_qp = qp;
- qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
- qp_init_attr->qp_context);
- if (!IS_ERR(qp))
- __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
- else
- real_qp->device->destroy_qp(real_qp);
- } else {
- qp->event_handler = qp_init_attr->event_handler;
- qp->qp_context = qp_init_attr->qp_context;
- if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
- qp->recv_cq = NULL;
- qp->srq = NULL;
- } else {
- qp->recv_cq = qp_init_attr->recv_cq;
- atomic_inc(&qp_init_attr->recv_cq->usecnt);
- qp->srq = qp_init_attr->srq;
- if (qp->srq)
- atomic_inc(&qp_init_attr->srq->usecnt);
- }
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->xrcd = NULL;
- qp->pd = pd;
- qp->send_cq = qp_init_attr->send_cq;
- qp->xrcd = NULL;
+ atomic_inc(&pd->usecnt);
+ if (qp_init_attr->send_cq)
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ if (qp_init_attr->rwq_ind_tbl)
+ atomic_inc(&qp->rwq_ind_tbl->usecnt);
- atomic_inc(&pd->usecnt);
- atomic_inc(&qp_init_attr->send_cq->usecnt);
+ if (qp_init_attr->cap.max_rdma_ctxs) {
+ ret = rdma_rw_init_mrs(qp, qp_init_attr);
+ if (ret) {
+ pr_err("failed to init MR pool ret= %d\n", ret);
+ ib_destroy_qp(qp);
+ qp = ERR_PTR(ret);
}
}
+ /*
+ * Note: all hw drivers guarantee that max_send_sge is lower than
+ * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+ * max_send_sge <= max_sge_rd.
+ */
+ qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+ qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+ device->attrs.max_sge_rd);
+
return qp;
}
EXPORT_SYMBOL(ib_create_qp);
@@ -1248,8 +1303,11 @@ int ib_destroy_qp(struct ib_qp *qp)
struct ib_pd *pd;
struct ib_cq *scq, *rcq;
struct ib_srq *srq;
+ struct ib_rwq_ind_table *ind_tbl;
int ret;
+ WARN_ON_ONCE(qp->mrs_used > 0);
+
if (atomic_read(&qp->usecnt))
return -EBUSY;
@@ -1260,6 +1318,10 @@ int ib_destroy_qp(struct ib_qp *qp)
scq = qp->send_cq;
rcq = qp->recv_cq;
srq = qp->srq;
+ ind_tbl = qp->rwq_ind_tbl;
+
+ if (!qp->uobject)
+ rdma_rw_cleanup_mrs(qp);
ret = qp->device->destroy_qp(qp);
if (!ret) {
@@ -1271,6 +1333,8 @@ int ib_destroy_qp(struct ib_qp *qp)
atomic_dec(&rcq->usecnt);
if (srq)
atomic_dec(&srq->usecnt);
+ if (ind_tbl)
+ atomic_dec(&ind_tbl->usecnt);
}
return ret;
@@ -1343,6 +1407,7 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
mr->pd = pd;
mr->uobject = NULL;
atomic_inc(&pd->usecnt);
+ mr->need_inval = false;
}
return mr;
@@ -1389,6 +1454,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
mr->pd = pd;
mr->uobject = NULL;
atomic_inc(&pd->usecnt);
+ mr->need_inval = false;
}
return mr;
@@ -1516,6 +1582,150 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
}
EXPORT_SYMBOL(ib_dealloc_xrcd);
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_init_attr: A list of initial attributes required to create the
+ * WQ. If WQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created WQ.
+ *
+ * wq_init_attr->max_wr and wq_init_attr->max_sge determine
+ * the requested size of the WQ, and set to the actual values allocated
+ * on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *wq_attr)
+{
+ struct ib_wq *wq;
+
+ if (!pd->device->create_wq)
+ return ERR_PTR(-ENOSYS);
+
+ wq = pd->device->create_wq(pd, wq_attr, NULL);
+ if (!IS_ERR(wq)) {
+ wq->event_handler = wq_attr->event_handler;
+ wq->wq_context = wq_attr->wq_context;
+ wq->wq_type = wq_attr->wq_type;
+ wq->cq = wq_attr->cq;
+ wq->device = pd->device;
+ wq->pd = pd;
+ wq->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&wq_attr->cq->usecnt);
+ atomic_set(&wq->usecnt, 0);
+ }
+ return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
+
+/**
+ * ib_destroy_wq - Destroys the specified WQ.
+ * @wq: The WQ to destroy.
+ */
+int ib_destroy_wq(struct ib_wq *wq)
+{
+ int err;
+ struct ib_cq *cq = wq->cq;
+ struct ib_pd *pd = wq->pd;
+
+ if (atomic_read(&wq->usecnt))
+ return -EBUSY;
+
+ err = wq->device->destroy_wq(wq);
+ if (!err) {
+ atomic_dec(&pd->usecnt);
+ atomic_dec(&cq->usecnt);
+ }
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_wq);
+
+/**
+ * ib_modify_wq - Modifies the specified WQ.
+ * @wq: The WQ to modify.
+ * @wq_attr: On input, specifies the WQ attributes to modify.
+ * @wq_attr_mask: A bit-mask used to specify which attributes of the WQ
+ * are being modified.
+ * On output, the current values of selected WQ attributes are returned.
+ */
+int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask)
+{
+ int err;
+
+ if (!wq->device->modify_wq)
+ return -ENOSYS;
+
+ err = wq->device->modify_wq(wq, wq_attr, wq_attr_mask, NULL);
+ return err;
+}
+EXPORT_SYMBOL(ib_modify_wq);
+
+/*
+ * ib_create_rwq_ind_table - Creates a RQ Indirection Table.
+ * @device: The device on which to create the rwq indirection table.
+ * @ib_rwq_ind_table_init_attr: A list of initial attributes required to
+ * create the Indirection Table.
+ *
+ * Note: The life time of ib_rwq_ind_table_init_attr->ind_tbl is not less
+ * than the created ib_rwq_ind_table object and the caller is responsible
+ * for its memory allocation/free.
+ */
+struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr *init_attr)
+{
+ struct ib_rwq_ind_table *rwq_ind_table;
+ int i;
+ u32 table_size;
+
+ if (!device->create_rwq_ind_table)
+ return ERR_PTR(-ENOSYS);
+
+ table_size = (1 << init_attr->log_ind_tbl_size);
+ rwq_ind_table = device->create_rwq_ind_table(device,
+ init_attr, NULL);
+ if (IS_ERR(rwq_ind_table))
+ return rwq_ind_table;
+
+ rwq_ind_table->ind_tbl = init_attr->ind_tbl;
+ rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size;
+ rwq_ind_table->device = device;
+ rwq_ind_table->uobject = NULL;
+ atomic_set(&rwq_ind_table->usecnt, 0);
+
+ for (i = 0; i < table_size; i++)
+ atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt);
+
+ return rwq_ind_table;
+}
+EXPORT_SYMBOL(ib_create_rwq_ind_table);
+
+/*
+ * ib_destroy_rwq_ind_table - Destroys the specified Indirection Table.
+ * @wq_ind_table: The Indirection Table to destroy.
+*/
+int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table)
+{
+ int err, i;
+ u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size);
+ struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl;
+
+ if (atomic_read(&rwq_ind_table->usecnt))
+ return -EBUSY;
+
+ err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table);
+ if (!err) {
+ for (i = 0; i < table_size; i++)
+ atomic_dec(&ind_tbl[i]->usecnt);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_rwq_ind_table);
+
struct ib_flow *ib_create_flow(struct ib_qp *qp,
struct ib_flow_attr *flow_attr,
int domain)
@@ -1597,6 +1807,7 @@ EXPORT_SYMBOL(ib_set_vf_guid);
* @mr: memory region
* @sg: dma mapped scatterlist
* @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
* @page_size: page vector desired page size
*
* Constraints:
@@ -1615,17 +1826,15 @@ EXPORT_SYMBOL(ib_set_vf_guid);
* After this completes successfully, the memory region
* is ready for registration.
*/
-int ib_map_mr_sg(struct ib_mr *mr,
- struct scatterlist *sg,
- int sg_nents,
- unsigned int page_size)
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size)
{
if (unlikely(!mr->device->map_mr_sg))
return -ENOSYS;
mr->page_size = page_size;
- return mr->device->map_mr_sg(mr, sg, sg_nents);
+ return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset);
}
EXPORT_SYMBOL(ib_map_mr_sg);
@@ -1635,6 +1844,10 @@ EXPORT_SYMBOL(ib_map_mr_sg);
* @mr: memory region
* @sgl: dma mapped scatterlist
* @sg_nents: number of entries in sg
+ * @sg_offset_p: IN: start offset in bytes into sg
+ * OUT: offset in bytes for element n of the sg of the first
+ * byte that has not been processed where n is the return
+ * value of this function.
* @set_page: driver page assignment function pointer
*
* Core service helper for drivers to convert the largest
@@ -1645,23 +1858,26 @@ EXPORT_SYMBOL(ib_map_mr_sg);
* Returns the number of sg elements that were assigned to
* a page vector.
*/
-int ib_sg_to_pages(struct ib_mr *mr,
- struct scatterlist *sgl,
- int sg_nents,
- int (*set_page)(struct ib_mr *, u64))
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+ unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
{
struct scatterlist *sg;
u64 last_end_dma_addr = 0;
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
unsigned int last_page_off = 0;
u64 page_mask = ~((u64)mr->page_size - 1);
int i, ret;
- mr->iova = sg_dma_address(&sgl[0]);
+ if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
+ return -EINVAL;
+
+ mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
mr->length = 0;
for_each_sg(sgl, sg, sg_nents, i) {
- u64 dma_addr = sg_dma_address(sg);
- unsigned int dma_len = sg_dma_len(sg);
+ u64 dma_addr = sg_dma_address(sg) + sg_offset;
+ u64 prev_addr = dma_addr;
+ unsigned int dma_len = sg_dma_len(sg) - sg_offset;
u64 end_dma_addr = dma_addr + dma_len;
u64 page_addr = dma_addr & page_mask;
@@ -1685,8 +1901,14 @@ int ib_sg_to_pages(struct ib_mr *mr,
do {
ret = set_page(mr, page_addr);
- if (unlikely(ret < 0))
- return i ? : ret;
+ if (unlikely(ret < 0)) {
+ sg_offset = prev_addr - sg_dma_address(sg);
+ mr->length += prev_addr - dma_addr;
+ if (sg_offset_p)
+ *sg_offset_p = sg_offset;
+ return i || sg_offset ? i : ret;
+ }
+ prev_addr = page_addr;
next_page:
page_addr += mr->page_size;
} while (page_addr < end_dma_addr);
@@ -1694,8 +1916,12 @@ next_page:
mr->length += dma_len;
last_end_dma_addr = end_dma_addr;
last_page_off = end_dma_addr & ~page_mask;
+
+ sg_offset = 0;
}
+ if (sg_offset_p)
+ *sg_offset_p = 0;
return i;
}
EXPORT_SYMBOL(ib_sg_to_pages);
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index c7ad0a4c8b15..c0c7cf8af3f4 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/
obj-$(CONFIG_INFINIBAND_NES) += nes/
obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/
obj-$(CONFIG_INFINIBAND_USNIC) += usnic/
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index de1c61b417d6..ada2e5009c86 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -327,7 +327,7 @@ int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
kfree(cq->sw_queue);
dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
(1UL << (cq->size_log2))
- * sizeof(struct t3_cqe), cq->queue,
+ * sizeof(struct t3_cqe) + 1, cq->queue,
dma_unmap_addr(cq, mapping));
cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
return err;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index d403231a4aff..04bbf172abde 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -367,7 +367,7 @@ static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb)
*/
static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
{
- printk(KERN_ERR MOD "ARP failure duing connect\n");
+ printk(KERN_ERR MOD "ARP failure during connect\n");
kfree_skb(skb);
}
@@ -1396,10 +1396,10 @@ static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
state_set(&child_ep->com, CONNECTING);
child_ep->com.tdev = tdev;
child_ep->com.cm_id = NULL;
- child_ep->com.local_addr.sin_family = PF_INET;
+ child_ep->com.local_addr.sin_family = AF_INET;
child_ep->com.local_addr.sin_port = req->local_port;
child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
- child_ep->com.remote_addr.sin_family = PF_INET;
+ child_ep->com.remote_addr.sin_family = AF_INET;
child_ep->com.remote_addr.sin_port = req->peer_port;
child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
get_ep(&parent_ep->com);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 3234a8be16f6..3edb80644b53 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -783,15 +783,14 @@ static int iwch_set_page(struct ib_mr *ibmr, u64 addr)
return 0;
}
-static int iwch_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents)
+static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset)
{
struct iwch_mr *mhp = to_iwch_mr(ibmr);
mhp->npages = 0;
- return ib_sg_to_pages(ibmr, sg, sg_nents, iwch_set_page);
+ return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page);
}
static int iwch_destroy_qp(struct ib_qp *ib_qp)
@@ -1184,18 +1183,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
}
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
-{
- struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
- ibdev.dev);
- struct ethtool_drvinfo info;
- struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
-
- PDBG("%s dev 0x%p\n", __func__, dev);
- lldev->ethtool_ops->get_drvinfo(lldev, &info);
- return sprintf(buf, "%s\n", info.fw_version);
-}
-
static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -1219,69 +1206,127 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
iwch_dev->rdev.rnic_info.pdev->device);
}
-static int iwch_get_mib(struct ib_device *ibdev,
- union rdma_protocol_stats *stats)
+enum counters {
+ IPINRECEIVES,
+ IPINHDRERRORS,
+ IPINADDRERRORS,
+ IPINUNKNOWNPROTOS,
+ IPINDISCARDS,
+ IPINDELIVERS,
+ IPOUTREQUESTS,
+ IPOUTDISCARDS,
+ IPOUTNOROUTES,
+ IPREASMTIMEOUT,
+ IPREASMREQDS,
+ IPREASMOKS,
+ IPREASMFAILS,
+ TCPACTIVEOPENS,
+ TCPPASSIVEOPENS,
+ TCPATTEMPTFAILS,
+ TCPESTABRESETS,
+ TCPCURRESTAB,
+ TCPINSEGS,
+ TCPOUTSEGS,
+ TCPRETRANSSEGS,
+ TCPINERRS,
+ TCPOUTRSTS,
+ TCPRTOMIN,
+ TCPRTOMAX,
+ NR_COUNTERS
+};
+
+static const char * const names[] = {
+ [IPINRECEIVES] = "ipInReceives",
+ [IPINHDRERRORS] = "ipInHdrErrors",
+ [IPINADDRERRORS] = "ipInAddrErrors",
+ [IPINUNKNOWNPROTOS] = "ipInUnknownProtos",
+ [IPINDISCARDS] = "ipInDiscards",
+ [IPINDELIVERS] = "ipInDelivers",
+ [IPOUTREQUESTS] = "ipOutRequests",
+ [IPOUTDISCARDS] = "ipOutDiscards",
+ [IPOUTNOROUTES] = "ipOutNoRoutes",
+ [IPREASMTIMEOUT] = "ipReasmTimeout",
+ [IPREASMREQDS] = "ipReasmReqds",
+ [IPREASMOKS] = "ipReasmOKs",
+ [IPREASMFAILS] = "ipReasmFails",
+ [TCPACTIVEOPENS] = "tcpActiveOpens",
+ [TCPPASSIVEOPENS] = "tcpPassiveOpens",
+ [TCPATTEMPTFAILS] = "tcpAttemptFails",
+ [TCPESTABRESETS] = "tcpEstabResets",
+ [TCPCURRESTAB] = "tcpCurrEstab",
+ [TCPINSEGS] = "tcpInSegs",
+ [TCPOUTSEGS] = "tcpOutSegs",
+ [TCPRETRANSSEGS] = "tcpRetransSegs",
+ [TCPINERRS] = "tcpInErrs",
+ [TCPOUTRSTS] = "tcpOutRsts",
+ [TCPRTOMIN] = "tcpRtoMin",
+ [TCPRTOMAX] = "tcpRtoMax",
+};
+
+static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev,
+ u8 port_num)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
+
+ /* Our driver only supports device level stats */
+ if (port_num != 0)
+ return NULL;
+
+ return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+ u8 port, int index)
{
struct iwch_dev *dev;
struct tp_mib_stats m;
int ret;
+ if (port != 0 || !stats)
+ return -ENOSYS;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
dev = to_iwch_dev(ibdev);
ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
if (ret)
return -ENOSYS;
- memset(stats, 0, sizeof *stats);
- stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
- m.ipInReceive_lo;
- stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
- m.ipInHdrErrors_lo;
- stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
- m.ipInAddrErrors_lo;
- stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
- m.ipInUnknownProtos_lo;
- stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
- m.ipInDiscards_lo;
- stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
- m.ipInDelivers_lo;
- stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
- m.ipOutRequests_lo;
- stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
- m.ipOutDiscards_lo;
- stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
- m.ipOutNoRoutes_lo;
- stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
- stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
- stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
- stats->iw.ipReasmFails = (u64) m.ipReasmFails;
- stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
- stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
- stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
- stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
- stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
- stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
- stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
- m.tcpInSegs_lo;
- stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
- m.tcpOutSegs_lo;
- stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
- m.tcpRetransSeg_lo;
- stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
- m.tcpInErrs_lo;
- stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
- stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
- return 0;
+ stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) + m.ipInReceive_lo;
+ stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo;
+ stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo;
+ stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo;
+ stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo;
+ stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo;
+ stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo;
+ stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo;
+ stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo;
+ stats->value[IPREASMTIMEOUT] = m.ipReasmTimeout;
+ stats->value[IPREASMREQDS] = m.ipReasmReqds;
+ stats->value[IPREASMOKS] = m.ipReasmOKs;
+ stats->value[IPREASMFAILS] = m.ipReasmFails;
+ stats->value[TCPACTIVEOPENS] = m.tcpActiveOpens;
+ stats->value[TCPPASSIVEOPENS] = m.tcpPassiveOpens;
+ stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails;
+ stats->value[TCPESTABRESETS] = m.tcpEstabResets;
+ stats->value[TCPCURRESTAB] = m.tcpOutRsts;
+ stats->value[TCPINSEGS] = m.tcpCurrEstab;
+ stats->value[TCPOUTSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo;
+ stats->value[TCPRETRANSSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo;
+ stats->value[TCPINERRS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo,
+ stats->value[TCPOUTRSTS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo;
+ stats->value[TCPRTOMIN] = m.tcpRtoMin;
+ stats->value[TCPRTOMAX] = m.tcpRtoMax;
+
+ return stats->num_counters;
}
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
static struct device_attribute *iwch_class_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id,
};
@@ -1303,6 +1348,18 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str,
+ size_t str_len)
+{
+ struct iwch_dev *iwch_dev = to_iwch_dev(ibdev);
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+ PDBG("%s dev 0x%p\n", __func__, iwch_dev);
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+ snprintf(str, str_len, "%s", info.fw_version);
+}
+
int iwch_register_device(struct iwch_dev *dev)
{
int ret;
@@ -1374,9 +1431,11 @@ int iwch_register_device(struct iwch_dev *dev)
dev->ibdev.req_notify_cq = iwch_arm_cq;
dev->ibdev.post_send = iwch_post_send;
dev->ibdev.post_recv = iwch_post_receive;
- dev->ibdev.get_protocol_stats = iwch_get_mib;
+ dev->ibdev.alloc_hw_stats = iwch_alloc_stats;
+ dev->ibdev.get_hw_stats = iwch_get_mib;
dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
dev->ibdev.get_port_immutable = iwch_port_immutable;
+ dev->ibdev.get_dev_fw_str = get_dev_fw_ver_str;
dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm)
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 651711370d55..80f988984f44 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -119,7 +119,7 @@ MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
static int mpa_rev = 2;
module_param(mpa_rev, int, 0644);
MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
- "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
+ "1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft"
" compliant (default=2)");
static int markers_enabled;
@@ -145,19 +145,35 @@ static struct sk_buff_head rxq;
static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
static void ep_timeout(unsigned long arg);
static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb);
static LIST_HEAD(timeout_list);
static spinlock_t timeout_lock;
+static void deref_cm_id(struct c4iw_ep_common *epc)
+{
+ epc->cm_id->rem_ref(epc->cm_id);
+ epc->cm_id = NULL;
+ set_bit(CM_ID_DEREFED, &epc->history);
+}
+
+static void ref_cm_id(struct c4iw_ep_common *epc)
+{
+ set_bit(CM_ID_REFED, &epc->history);
+ epc->cm_id->add_ref(epc->cm_id);
+}
+
static void deref_qp(struct c4iw_ep *ep)
{
c4iw_qp_rem_ref(&ep->com.qp->ibqp);
clear_bit(QP_REFERENCED, &ep->com.flags);
+ set_bit(QP_DEREFED, &ep->com.history);
}
static void ref_qp(struct c4iw_ep *ep)
{
set_bit(QP_REFERENCED, &ep->com.flags);
+ set_bit(QP_REFED, &ep->com.history);
c4iw_qp_add_ref(&ep->com.qp->ibqp);
}
@@ -201,6 +217,8 @@ static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e);
if (error < 0)
kfree_skb(skb);
+ else if (error == NET_XMIT_DROP)
+ return -ENOMEM;
return error < 0 ? error : 0;
}
@@ -276,6 +294,25 @@ static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
return;
}
+static int alloc_ep_skb_list(struct sk_buff_head *ep_skb_list, int size)
+{
+ struct sk_buff *skb;
+ unsigned int i;
+ size_t len;
+
+ len = roundup(sizeof(union cpl_wr_size), 16);
+ for (i = 0; i < size; i++) {
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ goto fail;
+ skb_queue_tail(ep_skb_list, skb);
+ }
+ return 0;
+fail:
+ skb_queue_purge(ep_skb_list);
+ return -ENOMEM;
+}
+
static void *alloc_ep(int size, gfp_t gfp)
{
struct c4iw_ep_common *epc;
@@ -290,12 +327,65 @@ static void *alloc_ep(int size, gfp_t gfp)
return epc;
}
+static void remove_ep_tid(struct c4iw_ep *ep)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ep->com.dev->lock, flags);
+ _remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0);
+ if (idr_is_empty(&ep->com.dev->hwtid_idr))
+ wake_up(&ep->com.dev->wait);
+ spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+}
+
+static void insert_ep_tid(struct c4iw_ep *ep)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ep->com.dev->lock, flags);
+ _insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0);
+ spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+}
+
+/*
+ * Atomically lookup the ep ptr given the tid and grab a reference on the ep.
+ */
+static struct c4iw_ep *get_ep_from_tid(struct c4iw_dev *dev, unsigned int tid)
+{
+ struct c4iw_ep *ep;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ ep = idr_find(&dev->hwtid_idr, tid);
+ if (ep)
+ c4iw_get_ep(&ep->com);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return ep;
+}
+
+/*
+ * Atomically lookup the ep ptr given the stid and grab a reference on the ep.
+ */
+static struct c4iw_listen_ep *get_ep_from_stid(struct c4iw_dev *dev,
+ unsigned int stid)
+{
+ struct c4iw_listen_ep *ep;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->lock, flags);
+ ep = idr_find(&dev->stid_idr, stid);
+ if (ep)
+ c4iw_get_ep(&ep->com);
+ spin_unlock_irqrestore(&dev->lock, flags);
+ return ep;
+}
+
void _c4iw_free_ep(struct kref *kref)
{
struct c4iw_ep *ep;
ep = container_of(kref, struct c4iw_ep, com.kref);
- PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+ PDBG("%s ep %p state %s\n", __func__, ep, states[ep->com.state]);
if (test_bit(QP_REFERENCED, &ep->com.flags))
deref_qp(ep);
if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
@@ -309,17 +399,29 @@ void _c4iw_free_ep(struct kref *kref)
(const u32 *)&sin6->sin6_addr.s6_addr,
1);
}
- remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
dst_release(ep->dst);
cxgb4_l2t_release(ep->l2t);
+ if (ep->mpa_skb)
+ kfree_skb(ep->mpa_skb);
}
+ if (!skb_queue_empty(&ep->com.ep_skb_list))
+ skb_queue_purge(&ep->com.ep_skb_list);
kfree(ep);
}
static void release_ep_resources(struct c4iw_ep *ep)
{
set_bit(RELEASE_RESOURCES, &ep->com.flags);
+
+ /*
+ * If we have a hwtid, then remove it from the idr table
+ * so lookups will no longer find this endpoint. Otherwise
+ * we have a race where one thread finds the ep ptr just
+ * before the other thread is freeing the ep memory.
+ */
+ if (ep->hwtid != -1)
+ remove_ep_tid(ep);
c4iw_put_ep(&ep->com);
}
@@ -432,10 +534,74 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
static void arp_failure_discard(void *handle, struct sk_buff *skb)
{
- PDBG("%s c4iw_dev %p\n", __func__, handle);
+ pr_err(MOD "ARP failure\n");
kfree_skb(skb);
}
+static void mpa_start_arp_failure(void *handle, struct sk_buff *skb)
+{
+ pr_err("ARP failure during MPA Negotiation - Closing Connection\n");
+}
+
+enum {
+ NUM_FAKE_CPLS = 2,
+ FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0,
+ FAKE_CPL_PASS_PUT_EP_SAFE = NUM_CPL_CMDS + 1,
+};
+
+static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+
+ ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+ release_ep_resources(ep);
+ return 0;
+}
+
+static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+
+ ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *)));
+ c4iw_put_ep(&ep->parent_ep->com);
+ release_ep_resources(ep);
+ return 0;
+}
+
+/*
+ * Fake up a special CPL opcode and call sched() so process_work() will call
+ * _put_ep_safe() in a safe context to free the ep resources. This is needed
+ * because ARP error handlers are called in an ATOMIC context, and
+ * _c4iw_free_ep() needs to block.
+ */
+static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb,
+ int cpl)
+{
+ struct cpl_act_establish *rpl = cplhdr(skb);
+
+ /* Set our special ARP_FAILURE opcode */
+ rpl->ot.opcode = cpl;
+
+ /*
+ * Save ep in the skb->cb area, after where sched() will save the dev
+ * ptr.
+ */
+ *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep;
+ sched(ep->com.dev, skb);
+}
+
+/* Handle an ARP failure for an accept */
+static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep = handle;
+
+ pr_err(MOD "ARP failure during accept - tid %u -dropping connection\n",
+ ep->hwtid);
+
+ __state_set(&ep->com, DEAD);
+ queue_arp_failure_cpl(ep, skb, FAKE_CPL_PASS_PUT_EP_SAFE);
+}
+
/*
* Handle an ARP failure for an active open.
*/
@@ -444,9 +610,8 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
struct c4iw_ep *ep = handle;
printk(KERN_ERR MOD "ARP failure during connect\n");
- kfree_skb(skb);
connect_reply_upcall(ep, -EHOSTUNREACH);
- state_set(&ep->com, DEAD);
+ __state_set(&ep->com, DEAD);
if (ep->com.remote_addr.ss_family == AF_INET6) {
struct sockaddr_in6 *sin6 =
(struct sockaddr_in6 *)&ep->com.local_addr;
@@ -455,9 +620,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
}
remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
- dst_release(ep->dst);
- cxgb4_l2t_release(ep->l2t);
- c4iw_put_ep(&ep->com);
+ queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
}
/*
@@ -466,33 +629,41 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
*/
static void abort_arp_failure(void *handle, struct sk_buff *skb)
{
- struct c4iw_rdev *rdev = handle;
+ int ret;
+ struct c4iw_ep *ep = handle;
+ struct c4iw_rdev *rdev = &ep->com.dev->rdev;
struct cpl_abort_req *req = cplhdr(skb);
PDBG("%s rdev %p\n", __func__, rdev);
req->cmd = CPL_ABORT_NO_RST;
- c4iw_ofld_send(rdev, skb);
+ ret = c4iw_ofld_send(rdev, skb);
+ if (ret) {
+ __state_set(&ep->com, DEAD);
+ queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
+ }
}
-static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+static int send_flowc(struct c4iw_ep *ep)
{
- unsigned int flowclen = 80;
struct fw_flowc_wr *flowc;
+ struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list);
int i;
u16 vlan = ep->l2t->vlan;
int nparams;
+ if (WARN_ON(!skb))
+ return -ENOMEM;
+
if (vlan == CPL_L2T_VLAN_NONE)
nparams = 8;
else
nparams = 9;
- skb = get_skb(skb, flowclen, GFP_KERNEL);
- flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+ flowc = (struct fw_flowc_wr *)__skb_put(skb, FLOWC_LEN);
flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
FW_FLOWC_WR_NPARAMS_V(nparams));
- flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen,
+ flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(FLOWC_LEN,
16)) | FW_WR_FLOWID_V(ep->hwtid));
flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
@@ -530,21 +701,19 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
}
set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
- c4iw_ofld_send(&ep->com.dev->rdev, skb);
+ return c4iw_ofld_send(&ep->com.dev->rdev, skb);
}
-static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
+static int send_halfclose(struct c4iw_ep *ep)
{
struct cpl_close_con_req *req;
- struct sk_buff *skb;
+ struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list);
int wrlen = roundup(sizeof *req, 16);
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
- skb = get_skb(NULL, wrlen, gfp);
- if (!skb) {
- printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+ if (WARN_ON(!skb))
return -ENOMEM;
- }
+
set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
req = (struct cpl_close_con_req *) skb_put(skb, wrlen);
@@ -555,26 +724,24 @@ static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
}
-static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+static int send_abort(struct c4iw_ep *ep)
{
struct cpl_abort_req *req;
int wrlen = roundup(sizeof *req, 16);
+ struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list);
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
- skb = get_skb(skb, wrlen, gfp);
- if (!skb) {
- printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
- __func__);
+ if (WARN_ON(!req_skb))
return -ENOMEM;
- }
- set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
- t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure);
- req = (struct cpl_abort_req *) skb_put(skb, wrlen);
+
+ set_wr_txq(req_skb, CPL_PRIORITY_DATA, ep->txq_idx);
+ t4_set_arp_err_handler(req_skb, ep, abort_arp_failure);
+ req = (struct cpl_abort_req *)skb_put(req_skb, wrlen);
memset(req, 0, wrlen);
INIT_TP_WR(req, ep->hwtid);
OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
req->cmd = CPL_ABORT_SEND_RST;
- return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+ return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
}
static void best_mtu(const unsigned short *mtus, unsigned short mtu,
@@ -807,10 +974,10 @@ clip_release:
return ret;
}
-static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
- u8 mpa_rev_to_use)
+static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
+ u8 mpa_rev_to_use)
{
- int mpalen, wrlen;
+ int mpalen, wrlen, ret;
struct fw_ofld_tx_data_wr *req;
struct mpa_message *mpa;
struct mpa_v2_conn_params mpa_v2_params;
@@ -826,7 +993,7 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
skb = get_skb(skb, wrlen, GFP_KERNEL);
if (!skb) {
connect_reply_upcall(ep, -ENOMEM);
- return;
+ return -ENOMEM;
}
set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
@@ -846,9 +1013,19 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
mpa = (struct mpa_message *)(req + 1);
memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
- mpa->flags = (crc_enabled ? MPA_CRC : 0) |
- (markers_enabled ? MPA_MARKERS : 0) |
- (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0);
+
+ mpa->flags = 0;
+ if (crc_enabled)
+ mpa->flags |= MPA_CRC;
+ if (markers_enabled) {
+ mpa->flags |= MPA_MARKERS;
+ ep->mpa_attr.recv_marker_enabled = 1;
+ } else {
+ ep->mpa_attr.recv_marker_enabled = 0;
+ }
+ if (mpa_rev_to_use == 2)
+ mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+
mpa->private_data_size = htons(ep->plen);
mpa->revision = mpa_rev_to_use;
if (mpa_rev_to_use == 1) {
@@ -894,12 +1071,14 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
BUG_ON(ep->mpa_skb);
ep->mpa_skb = skb;
- c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+ ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+ if (ret)
+ return ret;
start_ep_timer(ep);
__state_set(&ep->com, MPA_REQ_SENT);
ep->mpa_attr.initiator = 1;
ep->snd_seq += mpalen;
- return;
+ return ret;
}
static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
@@ -975,7 +1154,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
*/
skb_get(skb);
set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
- t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure);
BUG_ON(ep->mpa_skb);
ep->mpa_skb = skb;
ep->snd_seq += mpalen;
@@ -1021,8 +1200,11 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
mpa = (struct mpa_message *)(req + 1);
memset(mpa, 0, sizeof(*mpa));
memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
- mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
- (markers_enabled ? MPA_MARKERS : 0);
+ mpa->flags = 0;
+ if (ep->mpa_attr.crc_enabled)
+ mpa->flags |= MPA_CRC;
+ if (ep->mpa_attr.recv_marker_enabled)
+ mpa->flags |= MPA_MARKERS;
mpa->revision = ep->mpa_attr.version;
mpa->private_data_size = htons(plen);
@@ -1060,7 +1242,7 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
* Function fw4_ack() will deref it.
*/
skb_get(skb);
- t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure);
ep->mpa_skb = skb;
__state_set(&ep->com, MPA_REP_SENT);
ep->snd_seq += mpalen;
@@ -1074,6 +1256,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
unsigned int tid = GET_TID(req);
unsigned int atid = TID_TID_G(ntohl(req->tos_atid));
struct tid_info *t = dev->rdev.lldi.tids;
+ int ret;
ep = lookup_atid(t, atid);
@@ -1086,7 +1269,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
/* setup the hwtid for this connection */
ep->hwtid = tid;
cxgb4_insert_tid(t, ep, tid);
- insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
+ insert_ep_tid(ep);
ep->snd_seq = be32_to_cpu(req->snd_isn);
ep->rcv_seq = be32_to_cpu(req->rcv_isn);
@@ -1099,12 +1282,21 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
set_bit(ACT_ESTAB, &ep->com.history);
/* start MPA negotiation */
- send_flowc(ep, NULL);
+ ret = send_flowc(ep);
+ if (ret)
+ goto err;
if (ep->retry_with_mpa_v1)
- send_mpa_req(ep, skb, 1);
+ ret = send_mpa_req(ep, skb, 1);
else
- send_mpa_req(ep, skb, mpa_rev);
+ ret = send_mpa_req(ep, skb, mpa_rev);
+ if (ret)
+ goto err;
+ mutex_unlock(&ep->com.mutex);
+ return 0;
+err:
mutex_unlock(&ep->com.mutex);
+ connect_reply_upcall(ep, -ENOMEM);
+ c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
return 0;
}
@@ -1120,20 +1312,11 @@ static void close_complete_upcall(struct c4iw_ep *ep, int status)
PDBG("close complete delivered ep %p cm_id %p tid %u\n",
ep, ep->com.cm_id, ep->hwtid);
ep->com.cm_id->event_handler(ep->com.cm_id, &event);
- ep->com.cm_id->rem_ref(ep->com.cm_id);
- ep->com.cm_id = NULL;
+ deref_cm_id(&ep->com);
set_bit(CLOSE_UPCALL, &ep->com.history);
}
}
-static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
-{
- PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
- __state_set(&ep->com, ABORTING);
- set_bit(ABORT_CONN, &ep->com.history);
- return send_abort(ep, skb, gfp);
-}
-
static void peer_close_upcall(struct c4iw_ep *ep)
{
struct iw_cm_event event;
@@ -1161,8 +1344,7 @@ static void peer_abort_upcall(struct c4iw_ep *ep)
PDBG("abort delivered ep %p cm_id %p tid %u\n", ep,
ep->com.cm_id, ep->hwtid);
ep->com.cm_id->event_handler(ep->com.cm_id, &event);
- ep->com.cm_id->rem_ref(ep->com.cm_id);
- ep->com.cm_id = NULL;
+ deref_cm_id(&ep->com);
set_bit(ABORT_UPCALL, &ep->com.history);
}
}
@@ -1205,10 +1387,8 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status)
set_bit(CONN_RPL_UPCALL, &ep->com.history);
ep->com.cm_id->event_handler(ep->com.cm_id, &event);
- if (status < 0) {
- ep->com.cm_id->rem_ref(ep->com.cm_id);
- ep->com.cm_id = NULL;
- }
+ if (status < 0)
+ deref_cm_id(&ep->com);
}
static int connect_request_upcall(struct c4iw_ep *ep)
@@ -1301,6 +1481,18 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits)
#define RELAXED_IRD_NEGOTIATION 1
+/*
+ * process_mpa_reply - process streaming mode MPA reply
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
{
struct mpa_message *mpa;
@@ -1316,20 +1508,12 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
/*
- * Stop mpa timer. If it expired, then
- * we ignore the MPA reply. process_timeout()
- * will abort the connection.
- */
- if (stop_ep_timer(ep))
- return 0;
-
- /*
* If we get more than the supported amount of private data
* then we must fail this connection.
*/
if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
err = -EINVAL;
- goto err;
+ goto err_stop_timer;
}
/*
@@ -1351,11 +1535,11 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
" Received = %d\n", __func__, mpa_rev, mpa->revision);
err = -EPROTO;
- goto err;
+ goto err_stop_timer;
}
if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
err = -EPROTO;
- goto err;
+ goto err_stop_timer;
}
plen = ntohs(mpa->private_data_size);
@@ -1365,7 +1549,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
*/
if (plen > MPA_MAX_PRIVATE_DATA) {
err = -EPROTO;
- goto err;
+ goto err_stop_timer;
}
/*
@@ -1373,7 +1557,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
*/
if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
err = -EPROTO;
- goto err;
+ goto err_stop_timer;
}
ep->plen = (u8) plen;
@@ -1387,17 +1571,24 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
if (mpa->flags & MPA_REJECT) {
err = -ECONNREFUSED;
- goto err;
+ goto err_stop_timer;
}
/*
+ * Stop mpa timer. If it expired, then
+ * we ignore the MPA reply. process_timeout()
+ * will abort the connection.
+ */
+ if (stop_ep_timer(ep))
+ return 0;
+
+ /*
* If we get here we have accumulated the entire mpa
* start reply message including private data. And
* the MPA header is valid.
*/
__state_set(&ep->com, FPDU_MODE);
ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
- ep->mpa_attr.recv_marker_enabled = markers_enabled;
ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
ep->mpa_attr.version = mpa->revision;
ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
@@ -1529,15 +1720,28 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
goto out;
}
goto out;
+err_stop_timer:
+ stop_ep_timer(ep);
err:
- __state_set(&ep->com, ABORTING);
- send_abort(ep, skb, GFP_KERNEL);
+ disconnect = 2;
out:
connect_reply_upcall(ep, err);
return disconnect;
}
-static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+/*
+ * process_mpa_request - process streaming mode MPA request
+ *
+ * Returns:
+ *
+ * 0 upon success indicating a connect request was delivered to the ULP
+ * or the mpa request is incomplete but valid so far.
+ *
+ * 1 if a failure requires the caller to close the connection.
+ *
+ * 2 if a failure requires the caller to abort the connection.
+ */
+static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
{
struct mpa_message *mpa;
struct mpa_v2_conn_params *mpa_v2_params;
@@ -1549,11 +1753,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
* If we get more than the supported amount of private data
* then we must fail this connection.
*/
- if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
- (void)stop_ep_timer(ep);
- abort_connection(ep, skb, GFP_KERNEL);
- return;
- }
+ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt))
+ goto err_stop_timer;
PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
@@ -1569,7 +1770,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
* We'll continue process when more data arrives.
*/
if (ep->mpa_pkt_len < sizeof(*mpa))
- return;
+ return 0;
PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
mpa = (struct mpa_message *) ep->mpa_pkt;
@@ -1580,43 +1781,32 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
if (mpa->revision > mpa_rev) {
printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
" Received = %d\n", __func__, mpa_rev, mpa->revision);
- (void)stop_ep_timer(ep);
- abort_connection(ep, skb, GFP_KERNEL);
- return;
+ goto err_stop_timer;
}
- if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
- (void)stop_ep_timer(ep);
- abort_connection(ep, skb, GFP_KERNEL);
- return;
- }
+ if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)))
+ goto err_stop_timer;
plen = ntohs(mpa->private_data_size);
/*
* Fail if there's too much private data.
*/
- if (plen > MPA_MAX_PRIVATE_DATA) {
- (void)stop_ep_timer(ep);
- abort_connection(ep, skb, GFP_KERNEL);
- return;
- }
+ if (plen > MPA_MAX_PRIVATE_DATA)
+ goto err_stop_timer;
/*
* If plen does not account for pkt size
*/
- if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
- (void)stop_ep_timer(ep);
- abort_connection(ep, skb, GFP_KERNEL);
- return;
- }
+ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen))
+ goto err_stop_timer;
ep->plen = (u8) plen;
/*
* If we don't have all the pdata yet, then bail.
*/
if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
- return;
+ return 0;
/*
* If we get here we have accumulated the entire mpa
@@ -1639,8 +1829,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
(ep->mpa_pkt + sizeof(*mpa));
ep->ird = ntohs(mpa_v2_params->ird) &
MPA_V2_IRD_ORD_MASK;
+ ep->ird = min_t(u32, ep->ird,
+ cur_max_read_depth(ep->com.dev));
ep->ord = ntohs(mpa_v2_params->ord) &
MPA_V2_IRD_ORD_MASK;
+ ep->ord = min_t(u32, ep->ord,
+ cur_max_read_depth(ep->com.dev));
PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird,
ep->ord);
if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL)
@@ -1665,26 +1859,26 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
ep->mpa_attr.p2p_type);
- /*
- * If the endpoint timer already expired, then we ignore
- * the start request. process_timeout() will abort
- * the connection.
- */
- if (!stop_ep_timer(ep)) {
- __state_set(&ep->com, MPA_REQ_RCVD);
-
- /* drive upcall */
- mutex_lock_nested(&ep->parent_ep->com.mutex,
- SINGLE_DEPTH_NESTING);
- if (ep->parent_ep->com.state != DEAD) {
- if (connect_request_upcall(ep))
- abort_connection(ep, skb, GFP_KERNEL);
- } else {
- abort_connection(ep, skb, GFP_KERNEL);
- }
- mutex_unlock(&ep->parent_ep->com.mutex);
+ __state_set(&ep->com, MPA_REQ_RCVD);
+
+ /* drive upcall */
+ mutex_lock_nested(&ep->parent_ep->com.mutex, SINGLE_DEPTH_NESTING);
+ if (ep->parent_ep->com.state != DEAD) {
+ if (connect_request_upcall(ep))
+ goto err_unlock_parent;
+ } else {
+ goto err_unlock_parent;
}
- return;
+ mutex_unlock(&ep->parent_ep->com.mutex);
+ return 0;
+
+err_unlock_parent:
+ mutex_unlock(&ep->parent_ep->com.mutex);
+ goto err_out;
+err_stop_timer:
+ (void)stop_ep_timer(ep);
+err_out:
+ return 2;
}
static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
@@ -1693,11 +1887,10 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
struct cpl_rx_data *hdr = cplhdr(skb);
unsigned int dlen = ntohs(hdr->len);
unsigned int tid = GET_TID(hdr);
- struct tid_info *t = dev->rdev.lldi.tids;
__u8 status = hdr->status;
int disconnect = 0;
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
if (!ep)
return 0;
PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
@@ -1715,7 +1908,7 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
break;
case MPA_REQ_WAIT:
ep->rcv_seq += dlen;
- process_mpa_request(ep, skb);
+ disconnect = process_mpa_request(ep, skb);
break;
case FPDU_MODE: {
struct c4iw_qp_attributes attrs;
@@ -1736,7 +1929,8 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
}
mutex_unlock(&ep->com.mutex);
if (disconnect)
- c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+ c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL);
+ c4iw_put_ep(&ep->com);
return 0;
}
@@ -1746,9 +1940,8 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
int release = 0;
unsigned int tid = GET_TID(rpl);
- struct tid_info *t = dev->rdev.lldi.tids;
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
if (!ep) {
printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n");
return 0;
@@ -1770,10 +1963,11 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
if (release)
release_ep_resources(ep);
+ c4iw_put_ep(&ep->com);
return 0;
}
-static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
+static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
{
struct sk_buff *skb;
struct fw_ofld_connection_wr *req;
@@ -1843,16 +2037,21 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2);
set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
set_bit(ACT_OFLD_CONN, &ep->com.history);
- c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
}
/*
- * Return whether a failed active open has allocated a TID
+ * Some of the error codes above implicitly indicate that there is no TID
+ * allocated with the result of an ACT_OPEN. We use this predicate to make
+ * that explicit.
*/
static inline int act_open_has_tid(int status)
{
- return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
- status != CPL_ERR_ARP_MISS;
+ return (status != CPL_ERR_TCAM_PARITY &&
+ status != CPL_ERR_TCAM_MISS &&
+ status != CPL_ERR_TCAM_FULL &&
+ status != CPL_ERR_CONN_EXIST_SYNRECV &&
+ status != CPL_ERR_CONN_EXIST);
}
/* Returns whether a CPL status conveys negative advice.
@@ -1920,8 +2119,10 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,
}
ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
n, pdev, rt_tos2priority(tos));
- if (!ep->l2t)
+ if (!ep->l2t) {
+ dev_put(pdev);
goto out;
+ }
ep->mtu = pdev->mtu;
ep->tx_chan = cxgb4_port_chan(pdev);
ep->smac_idx = cxgb4_tp_smt_idx(adapter_type,
@@ -1973,6 +2174,7 @@ out:
static int c4iw_reconnect(struct c4iw_ep *ep)
{
int err = 0;
+ int size = 0;
struct sockaddr_in *laddr = (struct sockaddr_in *)
&ep->com.cm_id->m_local_addr;
struct sockaddr_in *raddr = (struct sockaddr_in *)
@@ -1986,6 +2188,22 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
init_timer(&ep->timer);
+ c4iw_init_wr_wait(&ep->com.wr_wait);
+
+ /* When MPA revision is different on nodes, the node with MPA_rev=2
+ * tries to reconnect with MPA_rev 1 for the same EP through
+ * c4iw_reconnect(), where the same EP is assigned with new tid for
+ * further connection establishment. As we are using the same EP pointer
+ * for reconnect, few skbs are used during the previous c4iw_connect(),
+ * which leaves the EP with inadequate skbs for further
+ * c4iw_reconnect(), Further causing an assert BUG_ON() due to empty
+ * skb_list() during peer_abort(). Allocate skbs which is already used.
+ */
+ size = (CN_MAX_CON_BUF - skb_queue_len(&ep->com.ep_skb_list));
+ if (alloc_ep_skb_list(&ep->com.ep_skb_list, size)) {
+ err = -ENOMEM;
+ goto fail1;
+ }
/*
* Allocate an active TID to initiate a TCP connection.
@@ -2052,6 +2270,7 @@ fail2:
* response of 1st connect request.
*/
connect_reply_upcall(ep, -ECONNRESET);
+fail1:
c4iw_put_ep(&ep->com);
out:
return err;
@@ -2069,6 +2288,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
struct sockaddr_in *ra;
struct sockaddr_in6 *la6;
struct sockaddr_in6 *ra6;
+ int ret = 0;
ep = lookup_atid(t, atid);
la = (struct sockaddr_in *)&ep->com.local_addr;
@@ -2104,9 +2324,10 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
mutex_unlock(&dev->rdev.stats.lock);
if (ep->com.local_addr.ss_family == AF_INET &&
dev->rdev.lldi.enable_fw_ofld_conn) {
- send_fw_act_open_req(ep,
- TID_TID_G(AOPEN_ATID_G(
- ntohl(rpl->atid_status))));
+ ret = send_fw_act_open_req(ep, TID_TID_G(AOPEN_ATID_G(
+ ntohl(rpl->atid_status))));
+ if (ret)
+ goto fail;
return 0;
}
break;
@@ -2146,6 +2367,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
break;
}
+fail:
connect_reply_upcall(ep, status2errno(status));
state_set(&ep->com, DEAD);
@@ -2170,9 +2392,8 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_pass_open_rpl *rpl = cplhdr(skb);
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int stid = GET_TID(rpl);
- struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+ struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid);
if (!ep) {
PDBG("%s stid %d lookup failure!\n", __func__, stid);
@@ -2181,7 +2402,7 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
PDBG("%s ep %p status %d error %d\n", __func__, ep,
rpl->status, status2errno(rpl->status));
c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
-
+ c4iw_put_ep(&ep->com);
out:
return 0;
}
@@ -2189,17 +2410,17 @@ out:
static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_close_listsvr_rpl *rpl = cplhdr(skb);
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int stid = GET_TID(rpl);
- struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+ struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid);
PDBG("%s ep %p\n", __func__, ep);
c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
+ c4iw_put_ep(&ep->com);
return 0;
}
-static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
- struct cpl_pass_accept_req *req)
+static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
+ struct cpl_pass_accept_req *req)
{
struct cpl_pass_accept_rpl *rpl;
unsigned int mtu_idx;
@@ -2287,10 +2508,9 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
rpl->opt0 = cpu_to_be64(opt0);
rpl->opt2 = cpu_to_be32(opt2);
set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
- t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
- c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+ t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure);
- return;
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
}
static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb)
@@ -2355,7 +2575,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
unsigned short hdrs;
u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
- parent_ep = lookup_stid(t, stid);
+ parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
if (!parent_ep) {
PDBG("%s connect request on invalid stid %d\n", __func__, stid);
goto reject;
@@ -2417,6 +2637,10 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
if (peer_mss && child_ep->mtu > (peer_mss + hdrs))
child_ep->mtu = peer_mss + hdrs;
+ skb_queue_head_init(&child_ep->com.ep_skb_list);
+ if (alloc_ep_skb_list(&child_ep->com.ep_skb_list, CN_MAX_CON_BUF))
+ goto fail;
+
state_set(&child_ep->com, CONNECTING);
child_ep->com.dev = dev;
child_ep->com.cm_id = NULL;
@@ -2468,17 +2692,25 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
init_timer(&child_ep->timer);
cxgb4_insert_tid(t, child_ep, hwtid);
- insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
- accept_cr(child_ep, skb, req);
- set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+ insert_ep_tid(child_ep);
+ if (accept_cr(child_ep, skb, req)) {
+ c4iw_put_ep(&parent_ep->com);
+ release_ep_resources(child_ep);
+ } else {
+ set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+ }
if (iptype == 6) {
sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr;
cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0],
(const u32 *)&sin6->sin6_addr.s6_addr, 1);
}
goto out;
+fail:
+ c4iw_put_ep(&child_ep->com);
reject:
reject_cr(dev, hwtid, skb);
+ if (parent_ep)
+ c4iw_put_ep(&parent_ep->com);
out:
return 0;
}
@@ -2487,10 +2719,10 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct c4iw_ep *ep;
struct cpl_pass_establish *req = cplhdr(skb);
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int tid = GET_TID(req);
+ int ret;
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
ep->snd_seq = be32_to_cpu(req->snd_isn);
ep->rcv_seq = be32_to_cpu(req->rcv_isn);
@@ -2501,10 +2733,15 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
set_emss(ep, ntohs(req->tcp_opt));
dst_confirm(ep->dst);
- state_set(&ep->com, MPA_REQ_WAIT);
+ mutex_lock(&ep->com.mutex);
+ ep->com.state = MPA_REQ_WAIT;
start_ep_timer(ep);
- send_flowc(ep, skb);
set_bit(PASS_ESTAB, &ep->com.history);
+ ret = send_flowc(ep);
+ mutex_unlock(&ep->com.mutex);
+ if (ret)
+ c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
+ c4iw_put_ep(&ep->com);
return 0;
}
@@ -2516,11 +2753,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
struct c4iw_qp_attributes attrs;
int disconnect = 1;
int release = 0;
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int tid = GET_TID(hdr);
int ret;
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
+ if (!ep)
+ return 0;
+
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
dst_confirm(ep->dst);
@@ -2592,6 +2831,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
if (release)
release_ep_resources(ep);
+ c4iw_put_ep(&ep->com);
return 0;
}
@@ -2604,10 +2844,12 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
struct c4iw_qp_attributes attrs;
int ret;
int release = 0;
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int tid = GET_TID(req);
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
+ if (!ep)
+ return 0;
+
if (is_neg_adv(req->status)) {
PDBG("%s Negative advice on abort- tid %u status %d (%s)\n",
__func__, ep->hwtid, req->status,
@@ -2616,7 +2858,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
mutex_lock(&dev->rdev.stats.lock);
dev->rdev.stats.neg_adv++;
mutex_unlock(&dev->rdev.stats.lock);
- return 0;
+ goto deref_ep;
}
PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
ep->com.state);
@@ -2633,6 +2875,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
mutex_lock(&ep->com.mutex);
switch (ep->com.state) {
case CONNECTING:
+ c4iw_put_ep(&ep->parent_ep->com);
break;
case MPA_REQ_WAIT:
(void)stop_ep_timer(ep);
@@ -2681,7 +2924,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
case DEAD:
PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
mutex_unlock(&ep->com.mutex);
- return 0;
+ goto deref_ep;
default:
BUG_ON(1);
break;
@@ -2695,10 +2938,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
}
mutex_unlock(&ep->com.mutex);
- rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
- if (!rpl_skb) {
- printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
- __func__);
+ rpl_skb = skb_dequeue(&ep->com.ep_skb_list);
+ if (WARN_ON(!rpl_skb)) {
release = 1;
goto out;
}
@@ -2728,6 +2969,10 @@ out:
c4iw_reconnect(ep);
}
+deref_ep:
+ c4iw_put_ep(&ep->com);
+ /* Dereferencing ep, referenced in peer_abort_intr() */
+ c4iw_put_ep(&ep->com);
return 0;
}
@@ -2737,16 +2982,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
struct c4iw_qp_attributes attrs;
struct cpl_close_con_rpl *rpl = cplhdr(skb);
int release = 0;
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int tid = GET_TID(rpl);
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
+ if (!ep)
+ return 0;
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
BUG_ON(!ep);
/* The cm_id may be null if we failed to connect */
mutex_lock(&ep->com.mutex);
+ set_bit(CLOSE_CON_RPL, &ep->com.history);
switch (ep->com.state) {
case CLOSING:
__state_set(&ep->com, MORIBUND);
@@ -2774,18 +3021,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
mutex_unlock(&ep->com.mutex);
if (release)
release_ep_resources(ep);
+ c4iw_put_ep(&ep->com);
return 0;
}
static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_rdma_terminate *rpl = cplhdr(skb);
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int tid = GET_TID(rpl);
struct c4iw_ep *ep;
struct c4iw_qp_attributes attrs;
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
BUG_ON(!ep);
if (ep && ep->com.qp) {
@@ -2796,6 +3043,7 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
} else
printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid);
+ c4iw_put_ep(&ep->com);
return 0;
}
@@ -2811,15 +3059,16 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
struct cpl_fw4_ack *hdr = cplhdr(skb);
u8 credits = hdr->credits;
unsigned int tid = GET_TID(hdr);
- struct tid_info *t = dev->rdev.lldi.tids;
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
+ if (!ep)
+ return 0;
PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
if (credits == 0) {
PDBG("%s 0 credit ack ep %p tid %u state %u\n",
__func__, ep, ep->hwtid, state_read(&ep->com));
- return 0;
+ goto out;
}
dst_confirm(ep->dst);
@@ -2827,36 +3076,40 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
PDBG("%s last streaming msg ack ep %p tid %u state %u "
"initiator %u freeing skb\n", __func__, ep, ep->hwtid,
state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
+ mutex_lock(&ep->com.mutex);
kfree_skb(ep->mpa_skb);
ep->mpa_skb = NULL;
+ if (test_bit(STOP_MPA_TIMER, &ep->com.flags))
+ stop_ep_timer(ep);
+ mutex_unlock(&ep->com.mutex);
}
+out:
+ c4iw_put_ep(&ep->com);
return 0;
}
int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
{
- int err = 0;
- int disconnect = 0;
+ int abort;
struct c4iw_ep *ep = to_ep(cm_id);
+
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
mutex_lock(&ep->com.mutex);
- if (ep->com.state == DEAD) {
+ if (ep->com.state != MPA_REQ_RCVD) {
mutex_unlock(&ep->com.mutex);
c4iw_put_ep(&ep->com);
return -ECONNRESET;
}
set_bit(ULP_REJECT, &ep->com.history);
- BUG_ON(ep->com.state != MPA_REQ_RCVD);
if (mpa_rev == 0)
- abort_connection(ep, NULL, GFP_KERNEL);
- else {
- err = send_mpa_reject(ep, pdata, pdata_len);
- disconnect = 1;
- }
+ abort = 1;
+ else
+ abort = send_mpa_reject(ep, pdata, pdata_len);
mutex_unlock(&ep->com.mutex);
- if (disconnect)
- err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+
+ stop_ep_timer(ep);
+ c4iw_ep_disconnect(ep, abort != 0, GFP_KERNEL);
c4iw_put_ep(&ep->com);
return 0;
}
@@ -2869,38 +3122,36 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
struct c4iw_ep *ep = to_ep(cm_id);
struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
+ int abort = 0;
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
mutex_lock(&ep->com.mutex);
- if (ep->com.state == DEAD) {
+ if (ep->com.state != MPA_REQ_RCVD) {
err = -ECONNRESET;
- goto err;
+ goto err_out;
}
- BUG_ON(ep->com.state != MPA_REQ_RCVD);
BUG_ON(!qp);
set_bit(ULP_ACCEPT, &ep->com.history);
if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) ||
(conn_param->ird > cur_max_read_depth(ep->com.dev))) {
- abort_connection(ep, NULL, GFP_KERNEL);
err = -EINVAL;
- goto err;
+ goto err_abort;
}
if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
if (conn_param->ord > ep->ird) {
if (RELAXED_IRD_NEGOTIATION) {
- ep->ord = ep->ird;
+ conn_param->ord = ep->ird;
} else {
ep->ird = conn_param->ird;
ep->ord = conn_param->ord;
send_mpa_reject(ep, conn_param->private_data,
conn_param->private_data_len);
- abort_connection(ep, NULL, GFP_KERNEL);
err = -ENOMEM;
- goto err;
+ goto err_abort;
}
}
if (conn_param->ird < ep->ord) {
@@ -2908,9 +3159,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
ep->ord <= h->rdev.lldi.max_ordird_qp) {
conn_param->ird = ep->ord;
} else {
- abort_connection(ep, NULL, GFP_KERNEL);
err = -ENOMEM;
- goto err;
+ goto err_abort;
}
}
}
@@ -2929,8 +3179,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
- cm_id->add_ref(cm_id);
ep->com.cm_id = cm_id;
+ ref_cm_id(&ep->com);
ep->com.qp = qp;
ref_qp(ep);
@@ -2951,23 +3201,27 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
err = c4iw_modify_qp(ep->com.qp->rhp,
ep->com.qp, mask, &attrs, 1);
if (err)
- goto err1;
+ goto err_deref_cm_id;
+
+ set_bit(STOP_MPA_TIMER, &ep->com.flags);
err = send_mpa_reply(ep, conn_param->private_data,
conn_param->private_data_len);
if (err)
- goto err1;
+ goto err_deref_cm_id;
__state_set(&ep->com, FPDU_MODE);
established_upcall(ep);
mutex_unlock(&ep->com.mutex);
c4iw_put_ep(&ep->com);
return 0;
-err1:
- ep->com.cm_id = NULL;
- abort_connection(ep, NULL, GFP_KERNEL);
- cm_id->rem_ref(cm_id);
-err:
+err_deref_cm_id:
+ deref_cm_id(&ep->com);
+err_abort:
+ abort = 1;
+err_out:
mutex_unlock(&ep->com.mutex);
+ if (abort)
+ c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
c4iw_put_ep(&ep->com);
return err;
}
@@ -3056,6 +3310,13 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
err = -ENOMEM;
goto out;
}
+
+ skb_queue_head_init(&ep->com.ep_skb_list);
+ if (alloc_ep_skb_list(&ep->com.ep_skb_list, CN_MAX_CON_BUF)) {
+ err = -ENOMEM;
+ goto fail1;
+ }
+
init_timer(&ep->timer);
ep->plen = conn_param->private_data_len;
if (ep->plen)
@@ -3067,14 +3328,14 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
if (peer2peer && ep->ord == 0)
ep->ord = 1;
- cm_id->add_ref(cm_id);
- ep->com.dev = dev;
ep->com.cm_id = cm_id;
+ ref_cm_id(&ep->com);
+ ep->com.dev = dev;
ep->com.qp = get_qhp(dev, conn_param->qpn);
if (!ep->com.qp) {
PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);
err = -EINVAL;
- goto fail1;
+ goto fail2;
}
ref_qp(ep);
PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
@@ -3087,7 +3348,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
if (ep->atid == -1) {
printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
err = -ENOMEM;
- goto fail1;
+ goto fail2;
}
insert_handle(dev, &dev->atid_idr, ep, ep->atid);
@@ -3108,10 +3369,10 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
/*
* Handle loopback requests to INADDR_ANY.
*/
- if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) {
+ if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) {
err = pick_local_ipaddrs(dev, cm_id);
if (err)
- goto fail1;
+ goto fail2;
}
/* find a route */
@@ -3131,7 +3392,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) {
err = pick_local_ip6addrs(dev, cm_id);
if (err)
- goto fail1;
+ goto fail2;
}
/* find a route */
@@ -3147,14 +3408,14 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
if (!ep->dst) {
printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
err = -EHOSTUNREACH;
- goto fail2;
+ goto fail3;
}
err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true,
ep->com.dev->rdev.lldi.adapter_type, cm_id->tos);
if (err) {
printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
- goto fail3;
+ goto fail4;
}
PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
@@ -3170,13 +3431,15 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
goto out;
cxgb4_l2t_release(ep->l2t);
-fail3:
+fail4:
dst_release(ep->dst);
-fail2:
+fail3:
remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail2:
+ skb_queue_purge(&ep->com.ep_skb_list);
+ deref_cm_id(&ep->com);
fail1:
- cm_id->rem_ref(cm_id);
c4iw_put_ep(&ep->com);
out:
return err;
@@ -3269,9 +3532,10 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
err = -ENOMEM;
goto fail1;
}
+ skb_queue_head_init(&ep->com.ep_skb_list);
PDBG("%s ep %p\n", __func__, ep);
- cm_id->add_ref(cm_id);
ep->com.cm_id = cm_id;
+ ref_cm_id(&ep->com);
ep->com.dev = dev;
ep->backlog = backlog;
memcpy(&ep->com.local_addr, &cm_id->m_local_addr,
@@ -3311,7 +3575,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
ep->com.local_addr.ss_family);
fail2:
- cm_id->rem_ref(cm_id);
+ deref_cm_id(&ep->com);
c4iw_put_ep(&ep->com);
fail1:
out:
@@ -3350,7 +3614,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
ep->com.local_addr.ss_family);
done:
- cm_id->rem_ref(cm_id);
+ deref_cm_id(&ep->com);
c4iw_put_ep(&ep->com);
return err;
}
@@ -3367,6 +3631,12 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
states[ep->com.state], abrupt);
+ /*
+ * Ref the ep here in case we have fatal errors causing the
+ * ep to be released and freed.
+ */
+ c4iw_get_ep(&ep->com);
+
rdev = &ep->com.dev->rdev;
if (c4iw_fatal_error(rdev)) {
fatal = 1;
@@ -3379,11 +3649,22 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
case MPA_REQ_RCVD:
case MPA_REP_SENT:
case FPDU_MODE:
+ case CONNECTING:
close = 1;
if (abrupt)
ep->com.state = ABORTING;
else {
ep->com.state = CLOSING;
+
+ /*
+ * if we close before we see the fw4_ack() then we fix
+ * up the timer state since we're reusing it.
+ */
+ if (ep->mpa_skb &&
+ test_bit(STOP_MPA_TIMER, &ep->com.flags)) {
+ clear_bit(STOP_MPA_TIMER, &ep->com.flags);
+ stop_ep_timer(ep);
+ }
start_ep_timer(ep);
}
set_bit(CLOSE_SENT, &ep->com.flags);
@@ -3413,15 +3694,35 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
if (abrupt) {
set_bit(EP_DISC_ABORT, &ep->com.history);
close_complete_upcall(ep, -ECONNRESET);
- ret = send_abort(ep, NULL, gfp);
+ ret = send_abort(ep);
} else {
set_bit(EP_DISC_CLOSE, &ep->com.history);
- ret = send_halfclose(ep, gfp);
+ ret = send_halfclose(ep);
}
- if (ret)
+ if (ret) {
+ set_bit(EP_DISC_FAIL, &ep->com.history);
+ if (!abrupt) {
+ stop_ep_timer(ep);
+ close_complete_upcall(ep, -EIO);
+ }
+ if (ep->com.qp) {
+ struct c4iw_qp_attributes attrs;
+
+ attrs.next_state = C4IW_QP_STATE_ERROR;
+ ret = c4iw_modify_qp(ep->com.qp->rhp,
+ ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ if (ret)
+ pr_err(MOD
+ "%s - qp <- error failed!\n",
+ __func__);
+ }
fatal = 1;
+ }
}
mutex_unlock(&ep->com.mutex);
+ c4iw_put_ep(&ep->com);
if (fatal)
release_ep_resources(ep);
return ret;
@@ -3676,7 +3977,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
struct cpl_pass_accept_req *req = (void *)(rss + 1);
struct l2t_entry *e;
struct dst_entry *dst;
- struct c4iw_ep *lep;
+ struct c4iw_ep *lep = NULL;
u16 window;
struct port_info *pi;
struct net_device *pdev;
@@ -3701,7 +4002,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
*/
stid = (__force int) cpu_to_be32((__force u32) rss->hash_val);
- lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid);
+ lep = (struct c4iw_ep *)get_ep_from_stid(dev, stid);
if (!lep) {
PDBG("%s connect request on invalid stid %d\n", __func__, stid);
goto reject;
@@ -3802,6 +4103,8 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
free_dst:
dst_release(dst);
reject:
+ if (lep)
+ c4iw_put_ep(&lep->com);
return 0;
}
@@ -3809,7 +4112,7 @@ reject:
* These are the real handlers that are called from a
* work queue.
*/
-static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = {
[CPL_ACT_ESTABLISH] = act_establish,
[CPL_ACT_OPEN_RPL] = act_open_rpl,
[CPL_RX_DATA] = rx_data,
@@ -3825,7 +4128,9 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
[CPL_RDMA_TERMINATE] = terminate,
[CPL_FW4_ACK] = fw4_ack,
[CPL_FW6_MSG] = deferred_fw6_msg,
- [CPL_RX_PKT] = rx_pkt
+ [CPL_RX_PKT] = rx_pkt,
+ [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe,
+ [FAKE_CPL_PASS_PUT_EP_SAFE] = _put_pass_ep_safe
};
static void process_timeout(struct c4iw_ep *ep)
@@ -3839,11 +4144,12 @@ static void process_timeout(struct c4iw_ep *ep)
set_bit(TIMEDOUT, &ep->com.history);
switch (ep->com.state) {
case MPA_REQ_SENT:
- __state_set(&ep->com, ABORTING);
connect_reply_upcall(ep, -ETIMEDOUT);
break;
case MPA_REQ_WAIT:
- __state_set(&ep->com, ABORTING);
+ case MPA_REQ_RCVD:
+ case MPA_REP_SENT:
+ case FPDU_MODE:
break;
case CLOSING:
case MORIBUND:
@@ -3853,7 +4159,6 @@ static void process_timeout(struct c4iw_ep *ep)
ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
&attrs, 1);
}
- __state_set(&ep->com, ABORTING);
close_complete_upcall(ep, -ETIMEDOUT);
break;
case ABORTING:
@@ -3871,9 +4176,9 @@ static void process_timeout(struct c4iw_ep *ep)
__func__, ep, ep->hwtid, ep->com.state);
abort = 0;
}
- if (abort)
- abort_connection(ep, NULL, GFP_KERNEL);
mutex_unlock(&ep->com.mutex);
+ if (abort)
+ c4iw_ep_disconnect(ep, 1, GFP_KERNEL);
c4iw_put_ep(&ep->com);
}
@@ -4006,10 +4311,10 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
{
struct cpl_abort_req_rss *req = cplhdr(skb);
struct c4iw_ep *ep;
- struct tid_info *t = dev->rdev.lldi.tids;
unsigned int tid = GET_TID(req);
- ep = lookup_tid(t, tid);
+ ep = get_ep_from_tid(dev, tid);
+ /* This EP will be dereferenced in peer_abort() */
if (!ep) {
printk(KERN_WARNING MOD
"Abort on non-existent endpoint, tid %d\n", tid);
@@ -4020,24 +4325,13 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
PDBG("%s Negative advice on abort- tid %u status %d (%s)\n",
__func__, ep->hwtid, req->status,
neg_adv_str(req->status));
- ep->stats.abort_neg_adv++;
- dev->rdev.stats.neg_adv++;
- kfree_skb(skb);
- return 0;
+ goto out;
}
PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
ep->com.state);
- /*
- * Wake up any threads in rdma_init() or rdma_fini().
- * However, if we are on MPAv2 and want to retry with MPAv1
- * then, don't wake up yet.
- */
- if (mpa_rev == 2 && !ep->tried_with_mpa_v1) {
- if (ep->com.state != MPA_REQ_SENT)
- c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
- } else
- c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+out:
sched(dev, skb);
return 0;
}
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index b0b955724458..ac926c942fee 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -33,19 +33,15 @@
#include "iw_cxgb4.h"
static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
- struct c4iw_dev_ucontext *uctx)
+ struct c4iw_dev_ucontext *uctx, struct sk_buff *skb)
{
struct fw_ri_res_wr *res_wr;
struct fw_ri_res *res;
int wr_len;
struct c4iw_wr_wait wr_wait;
- struct sk_buff *skb;
int ret;
wr_len = sizeof *res_wr + sizeof *res;
- skb = alloc_skb(wr_len, GFP_KERNEL);
- if (!skb)
- return -ENOMEM;
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
@@ -863,7 +859,9 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq)
ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
: NULL;
destroy_cq(&chp->rhp->rdev, &chp->cq,
- ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx);
+ ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
+ chp->destroy_skb);
+ chp->destroy_skb = NULL;
kfree(chp);
return 0;
}
@@ -879,7 +877,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
struct c4iw_cq *chp;
struct c4iw_create_cq_resp uresp;
struct c4iw_ucontext *ucontext = NULL;
- int ret;
+ int ret, wr_len;
size_t memsize, hwentries;
struct c4iw_mm_entry *mm, *mm2;
@@ -896,6 +894,13 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
if (!chp)
return ERR_PTR(-ENOMEM);
+ wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res);
+ chp->destroy_skb = alloc_skb(wr_len, GFP_KERNEL);
+ if (!chp->destroy_skb) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
if (ib_context)
ucontext = to_c4iw_ucontext(ib_context);
@@ -936,7 +941,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
ret = create_cq(&rhp->rdev, &chp->cq,
ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
if (ret)
- goto err1;
+ goto err2;
chp->rhp = rhp;
chp->cq.size--; /* status page */
@@ -947,15 +952,15 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
init_waitqueue_head(&chp->wait);
ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
if (ret)
- goto err2;
+ goto err3;
if (ucontext) {
mm = kmalloc(sizeof *mm, GFP_KERNEL);
if (!mm)
- goto err3;
+ goto err4;
mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
if (!mm2)
- goto err4;
+ goto err5;
uresp.qid_mask = rhp->rdev.cqmask;
uresp.cqid = chp->cq.cqid;
@@ -970,7 +975,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
ret = ib_copy_to_udata(udata, &uresp,
sizeof(uresp) - sizeof(uresp.reserved));
if (ret)
- goto err5;
+ goto err6;
mm->key = uresp.key;
mm->addr = virt_to_phys(chp->cq.queue);
@@ -986,15 +991,18 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
__func__, chp->cq.cqid, chp, chp->cq.size,
chp->cq.memsize, (unsigned long long) chp->cq.dma_addr);
return &chp->ibcq;
-err5:
+err6:
kfree(mm2);
-err4:
+err5:
kfree(mm);
-err3:
+err4:
remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
-err2:
+err3:
destroy_cq(&chp->rhp->rdev, &chp->cq,
- ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
+ chp->destroy_skb);
+err2:
+ kfree_skb(chp->destroy_skb);
err1:
kfree(chp);
return ERR_PTR(ret);
@@ -1008,15 +1016,15 @@ int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
struct c4iw_cq *chp;
- int ret;
+ int ret = 0;
unsigned long flag;
chp = to_c4iw_cq(ibcq);
spin_lock_irqsave(&chp->lock, flag);
- ret = t4_arm_cq(&chp->cq,
- (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
+ t4_arm_cq(&chp->cq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
+ if (flags & IB_CQ_REPORT_MISSED_EVENTS)
+ ret = t4_cq_notempty(&chp->cq);
spin_unlock_irqrestore(&chp->lock, flag);
- if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
- ret = 0;
return ret;
}
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index ae2e8b23d2dd..3c4b2126e0d1 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -317,7 +317,7 @@ static int qp_open(struct inode *inode, struct file *file)
idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
spin_unlock_irq(&qpd->devp->lock);
- qpd->bufsize = count * 128;
+ qpd->bufsize = count * 180;
qpd->buf = vmalloc(qpd->bufsize);
if (!qpd->buf) {
kfree(qpd);
@@ -872,9 +872,13 @@ static void c4iw_rdev_close(struct c4iw_rdev *rdev)
static void c4iw_dealloc(struct uld_ctx *ctx)
{
c4iw_rdev_close(&ctx->dev->rdev);
+ WARN_ON_ONCE(!idr_is_empty(&ctx->dev->cqidr));
idr_destroy(&ctx->dev->cqidr);
+ WARN_ON_ONCE(!idr_is_empty(&ctx->dev->qpidr));
idr_destroy(&ctx->dev->qpidr);
+ WARN_ON_ONCE(!idr_is_empty(&ctx->dev->mmidr));
idr_destroy(&ctx->dev->mmidr);
+ wait_event(ctx->dev->wait, idr_is_empty(&ctx->dev->hwtid_idr));
idr_destroy(&ctx->dev->hwtid_idr);
idr_destroy(&ctx->dev->stid_idr);
idr_destroy(&ctx->dev->atid_idr);
@@ -992,6 +996,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
mutex_init(&devp->rdev.stats.lock);
mutex_init(&devp->db_mutex);
INIT_LIST_HEAD(&devp->db_fc_list);
+ init_waitqueue_head(&devp->wait);
devp->avail_ird = devp->rdev.lldi.max_ird_adapter;
if (c4iw_debugfs_root) {
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index df43f871ab61..4b83b84f7ddf 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -263,6 +263,7 @@ struct c4iw_dev {
struct idr stid_idr;
struct list_head db_fc_list;
u32 avail_ird;
+ wait_queue_head_t wait;
};
static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
@@ -384,6 +385,7 @@ struct c4iw_mr {
struct ib_mr ibmr;
struct ib_umem *umem;
struct c4iw_dev *rhp;
+ struct sk_buff *dereg_skb;
u64 kva;
struct tpt_attributes attr;
u64 *mpl;
@@ -400,6 +402,7 @@ static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr)
struct c4iw_mw {
struct ib_mw ibmw;
struct c4iw_dev *rhp;
+ struct sk_buff *dereg_skb;
u64 kva;
struct tpt_attributes attr;
};
@@ -412,6 +415,7 @@ static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw)
struct c4iw_cq {
struct ib_cq ibcq;
struct c4iw_dev *rhp;
+ struct sk_buff *destroy_skb;
struct t4_cq cq;
spinlock_t lock;
spinlock_t comp_handler_lock;
@@ -472,7 +476,7 @@ struct c4iw_qp {
struct t4_wq wq;
spinlock_t lock;
struct mutex mutex;
- atomic_t refcnt;
+ struct kref kref;
wait_queue_head_t wait;
struct timer_list timer;
int sq_sig_all;
@@ -755,6 +759,7 @@ enum c4iw_ep_flags {
CLOSE_SENT = 3,
TIMEOUT = 4,
QP_REFERENCED = 5,
+ STOP_MPA_TIMER = 7,
};
enum c4iw_ep_history {
@@ -779,13 +784,38 @@ enum c4iw_ep_history {
EP_DISC_ABORT = 18,
CONN_RPL_UPCALL = 19,
ACT_RETRY_NOMEM = 20,
- ACT_RETRY_INUSE = 21
+ ACT_RETRY_INUSE = 21,
+ CLOSE_CON_RPL = 22,
+ EP_DISC_FAIL = 24,
+ QP_REFED = 25,
+ QP_DEREFED = 26,
+ CM_ID_REFED = 27,
+ CM_ID_DEREFED = 28,
+};
+
+enum conn_pre_alloc_buffers {
+ CN_ABORT_REQ_BUF,
+ CN_ABORT_RPL_BUF,
+ CN_CLOSE_CON_REQ_BUF,
+ CN_DESTROY_BUF,
+ CN_FLOWC_BUF,
+ CN_MAX_CON_BUF
+};
+
+#define FLOWC_LEN 80
+union cpl_wr_size {
+ struct cpl_abort_req abrt_req;
+ struct cpl_abort_rpl abrt_rpl;
+ struct fw_ri_wr ri_req;
+ struct cpl_close_con_req close_req;
+ char flowc_buf[FLOWC_LEN];
};
struct c4iw_ep_common {
struct iw_cm_id *cm_id;
struct c4iw_qp *qp;
struct c4iw_dev *dev;
+ struct sk_buff_head ep_skb_list;
enum c4iw_ep_state state;
struct kref kref;
struct mutex mutex;
@@ -917,9 +947,8 @@ void c4iw_qp_rem_ref(struct ib_qp *qp);
struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type,
u32 max_num_sg);
-int c4iw_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents);
+int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
int c4iw_dealloc_mw(struct ib_mw *mw);
struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 008be07d5604..0b91b0f4df71 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -59,9 +59,9 @@ static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
}
static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
- u32 len, dma_addr_t data, int wait)
+ u32 len, dma_addr_t data,
+ int wait, struct sk_buff *skb)
{
- struct sk_buff *skb;
struct ulp_mem_io *req;
struct ulptx_sgl *sgl;
u8 wr_len;
@@ -74,9 +74,11 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
c4iw_init_wr_wait(&wr_wait);
wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16);
- skb = alloc_skb(wr_len, GFP_KERNEL);
- if (!skb)
- return -ENOMEM;
+ if (!skb) {
+ skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+ if (!skb)
+ return -ENOMEM;
+ }
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
@@ -86,8 +88,9 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
(wait ? FW_WR_COMPL_F : 0));
req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L;
req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16)));
- req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE));
- req->cmd |= cpu_to_be32(T5_ULP_MEMIO_ORDER_V(1));
+ req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE) |
+ T5_ULP_MEMIO_ORDER_V(1) |
+ T5_ULP_MEMIO_FID_V(rdev->lldi.rxq_ids[0]));
req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5));
req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16));
req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr));
@@ -107,9 +110,8 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
}
static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
- void *data)
+ void *data, struct sk_buff *skb)
{
- struct sk_buff *skb;
struct ulp_mem_io *req;
struct ulptx_idata *sc;
u8 wr_len, *to_dp, *from_dp;
@@ -133,9 +135,11 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
wr_len = roundup(sizeof *req + sizeof *sc +
roundup(copy_len, T4_ULPTX_MIN_IO), 16);
- skb = alloc_skb(wr_len, GFP_KERNEL);
- if (!skb)
- return -ENOMEM;
+ if (!skb) {
+ skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+ if (!skb)
+ return -ENOMEM;
+ }
set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
@@ -172,6 +176,7 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO -
(copy_len % T4_ULPTX_MIN_IO));
ret = c4iw_ofld_send(rdev, skb);
+ skb = NULL;
if (ret)
return ret;
len -= C4IW_MAX_INLINE_SIZE;
@@ -181,7 +186,8 @@ static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
return ret;
}
-static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data)
+static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len,
+ void *data, struct sk_buff *skb)
{
u32 remain = len;
u32 dmalen;
@@ -204,7 +210,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *
dmalen = T4_ULPTX_MAX_DMA;
remain -= dmalen;
ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr,
- !remain);
+ !remain, skb);
if (ret)
goto out;
addr += dmalen >> 5;
@@ -212,7 +218,7 @@ static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *
daddr += dmalen;
}
if (remain)
- ret = _c4iw_write_mem_inline(rdev, addr, remain, data);
+ ret = _c4iw_write_mem_inline(rdev, addr, remain, data, skb);
out:
dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE);
return ret;
@@ -223,23 +229,25 @@ out:
* If data is NULL, clear len byte of memory to zero.
*/
static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len,
- void *data)
+ void *data, struct sk_buff *skb)
{
if (is_t5(rdev->lldi.adapter_type) && use_dsgl) {
if (len > inline_threshold) {
- if (_c4iw_write_mem_dma(rdev, addr, len, data)) {
+ if (_c4iw_write_mem_dma(rdev, addr, len, data, skb)) {
printk_ratelimited(KERN_WARNING
"%s: dma map"
" failure (non fatal)\n",
pci_name(rdev->lldi.pdev));
return _c4iw_write_mem_inline(rdev, addr, len,
- data);
- } else
+ data, skb);
+ } else {
return 0;
+ }
} else
- return _c4iw_write_mem_inline(rdev, addr, len, data);
+ return _c4iw_write_mem_inline(rdev, addr,
+ len, data, skb);
} else
- return _c4iw_write_mem_inline(rdev, addr, len, data);
+ return _c4iw_write_mem_inline(rdev, addr, len, data, skb);
}
/*
@@ -252,7 +260,8 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
u32 *stag, u8 stag_state, u32 pdid,
enum fw_ri_stag_type type, enum fw_ri_mem_perms perm,
int bind_enabled, u32 zbva, u64 to,
- u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr)
+ u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr,
+ struct sk_buff *skb)
{
int err;
struct fw_ri_tpte tpt;
@@ -306,7 +315,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
}
err = write_adapter_mem(rdev, stag_idx +
(rdev->lldi.vr->stag.start >> 5),
- sizeof(tpt), &tpt);
+ sizeof(tpt), &tpt, skb);
if (reset_tpt_entry) {
c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
@@ -326,28 +335,29 @@ static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
__func__, pbl_addr, rdev->lldi.vr->pbl.start,
pbl_size);
- err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl);
+ err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl, NULL);
return err;
}
static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size,
- u32 pbl_addr)
+ u32 pbl_addr, struct sk_buff *skb)
{
return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0,
- pbl_size, pbl_addr);
+ pbl_size, pbl_addr, skb);
}
static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid)
{
*stag = T4_STAG_UNSET;
return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0,
- 0UL, 0, 0, 0, 0);
+ 0UL, 0, 0, 0, 0, NULL);
}
-static int deallocate_window(struct c4iw_rdev *rdev, u32 stag)
+static int deallocate_window(struct c4iw_rdev *rdev, u32 stag,
+ struct sk_buff *skb)
{
return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0,
- 0);
+ 0, skb);
}
static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
@@ -355,7 +365,7 @@ static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
{
*stag = T4_STAG_UNSET;
return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0,
- 0UL, 0, 0, pbl_size, pbl_addr);
+ 0UL, 0, 0, pbl_size, pbl_addr, NULL);
}
static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
@@ -382,14 +392,16 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
mhp->attr.mw_bind_enable, mhp->attr.zbva,
mhp->attr.va_fbo, mhp->attr.len ?
mhp->attr.len : -1, shift - 12,
- mhp->attr.pbl_size, mhp->attr.pbl_addr);
+ mhp->attr.pbl_size, mhp->attr.pbl_addr, NULL);
if (ret)
return ret;
ret = finish_mem_reg(mhp, stag);
- if (ret)
+ if (ret) {
dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
- mhp->attr.pbl_addr);
+ mhp->attr.pbl_addr, mhp->dereg_skb);
+ mhp->dereg_skb = NULL;
+ }
return ret;
}
@@ -422,6 +434,12 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
if (!mhp)
return ERR_PTR(-ENOMEM);
+ mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+ if (!mhp->dereg_skb) {
+ ret = -ENOMEM;
+ goto err0;
+ }
+
mhp->rhp = rhp;
mhp->attr.pdid = php->pdid;
mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
@@ -434,7 +452,8 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid,
FW_RI_STAG_NSMR, mhp->attr.perms,
- mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0);
+ mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0,
+ NULL);
if (ret)
goto err1;
@@ -444,8 +463,10 @@ struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
return &mhp->ibmr;
err2:
dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
- mhp->attr.pbl_addr);
+ mhp->attr.pbl_addr, mhp->dereg_skb);
err1:
+ kfree_skb(mhp->dereg_skb);
+err0:
kfree(mhp);
return ERR_PTR(ret);
}
@@ -480,11 +501,18 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mhp)
return ERR_PTR(-ENOMEM);
+ mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+ if (!mhp->dereg_skb) {
+ kfree(mhp);
+ return ERR_PTR(-ENOMEM);
+ }
+
mhp->rhp = rhp;
mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
if (IS_ERR(mhp->umem)) {
err = PTR_ERR(mhp->umem);
+ kfree_skb(mhp->dereg_skb);
kfree(mhp);
return ERR_PTR(err);
}
@@ -549,6 +577,7 @@ err_pbl:
err:
ib_umem_release(mhp->umem);
+ kfree_skb(mhp->dereg_skb);
kfree(mhp);
return ERR_PTR(err);
}
@@ -571,11 +600,16 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
if (!mhp)
return ERR_PTR(-ENOMEM);
- ret = allocate_window(&rhp->rdev, &stag, php->pdid);
- if (ret) {
- kfree(mhp);
- return ERR_PTR(ret);
+
+ mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
+ if (!mhp->dereg_skb) {
+ ret = -ENOMEM;
+ goto free_mhp;
}
+
+ ret = allocate_window(&rhp->rdev, &stag, php->pdid);
+ if (ret)
+ goto free_skb;
mhp->rhp = rhp;
mhp->attr.pdid = php->pdid;
mhp->attr.type = FW_RI_STAG_MW;
@@ -583,12 +617,19 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
mmid = (stag) >> 8;
mhp->ibmw.rkey = stag;
if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
- deallocate_window(&rhp->rdev, mhp->attr.stag);
- kfree(mhp);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto dealloc_win;
}
PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
return &(mhp->ibmw);
+
+dealloc_win:
+ deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb);
+free_skb:
+ kfree_skb(mhp->dereg_skb);
+free_mhp:
+ kfree(mhp);
+ return ERR_PTR(ret);
}
int c4iw_dealloc_mw(struct ib_mw *mw)
@@ -601,7 +642,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
rhp = mhp->rhp;
mmid = (mw->rkey) >> 8;
remove_handle(rhp, &rhp->mmidr, mmid);
- deallocate_window(&rhp->rdev, mhp->attr.stag);
+ deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb);
+ kfree_skb(mhp->dereg_skb);
kfree(mhp);
PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
return 0;
@@ -665,7 +707,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
return &(mhp->ibmr);
err3:
dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
- mhp->attr.pbl_addr);
+ mhp->attr.pbl_addr, mhp->dereg_skb);
err2:
c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
mhp->attr.pbl_size << 3);
@@ -690,15 +732,14 @@ static int c4iw_set_page(struct ib_mr *ibmr, u64 addr)
return 0;
}
-int c4iw_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents)
+int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
{
struct c4iw_mr *mhp = to_c4iw_mr(ibmr);
mhp->mpl_len = 0;
- return ib_sg_to_pages(ibmr, sg, sg_nents, c4iw_set_page);
+ return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page);
}
int c4iw_dereg_mr(struct ib_mr *ib_mr)
@@ -717,7 +758,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev,
mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr);
dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
- mhp->attr.pbl_addr);
+ mhp->attr.pbl_addr, mhp->dereg_skb);
if (mhp->attr.pbl_size)
c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
mhp->attr.pbl_size << 3);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 7574f394fdac..df127ce6b6ec 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -409,20 +409,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
}
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
- ibdev.dev);
- PDBG("%s dev 0x%p\n", __func__, dev);
-
- return sprintf(buf, "%u.%u.%u.%u\n",
- FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
- FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
- FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
- FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
-}
-
static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -446,30 +432,67 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
c4iw_dev->rdev.lldi.pdev->device);
}
+enum counters {
+ IP4INSEGS,
+ IP4OUTSEGS,
+ IP4RETRANSSEGS,
+ IP4OUTRSTS,
+ IP6INSEGS,
+ IP6OUTSEGS,
+ IP6RETRANSSEGS,
+ IP6OUTRSTS,
+ NR_COUNTERS
+};
+
+static const char * const names[] = {
+ [IP4INSEGS] = "ip4InSegs",
+ [IP4OUTSEGS] = "ip4OutSegs",
+ [IP4RETRANSSEGS] = "ip4RetransSegs",
+ [IP4OUTRSTS] = "ip4OutRsts",
+ [IP6INSEGS] = "ip6InSegs",
+ [IP6OUTSEGS] = "ip6OutSegs",
+ [IP6RETRANSSEGS] = "ip6RetransSegs",
+ [IP6OUTRSTS] = "ip6OutRsts"
+};
+
+static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
+ u8 port_num)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
+
+ if (port_num != 0)
+ return NULL;
+
+ return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
static int c4iw_get_mib(struct ib_device *ibdev,
- union rdma_protocol_stats *stats)
+ struct rdma_hw_stats *stats,
+ u8 port, int index)
{
struct tp_tcp_stats v4, v6;
struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
- memset(stats, 0, sizeof *stats);
- stats->iw.tcpInSegs = v4.tcp_in_segs + v6.tcp_in_segs;
- stats->iw.tcpOutSegs = v4.tcp_out_segs + v6.tcp_out_segs;
- stats->iw.tcpRetransSegs = v4.tcp_retrans_segs + v6.tcp_retrans_segs;
- stats->iw.tcpOutRsts = v4.tcp_out_rsts + v6.tcp_out_rsts;
-
- return 0;
+ stats->value[IP4INSEGS] = v4.tcp_in_segs;
+ stats->value[IP4OUTSEGS] = v4.tcp_out_segs;
+ stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs;
+ stats->value[IP4OUTRSTS] = v4.tcp_out_rsts;
+ stats->value[IP6INSEGS] = v6.tcp_in_segs;
+ stats->value[IP6OUTSEGS] = v6.tcp_out_segs;
+ stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs;
+ stats->value[IP6OUTRSTS] = v6.tcp_out_rsts;
+
+ return stats->num_counters;
}
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
static struct device_attribute *c4iw_class_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id,
};
@@ -491,6 +514,20 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_dev_fw_str(struct ib_device *dev, char *str,
+ size_t str_len)
+{
+ struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+ ibdev);
+ PDBG("%s dev 0x%p\n", __func__, dev);
+
+ snprintf(str, str_len, "%u.%u.%u.%u",
+ FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
+ FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
+ FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
+ FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
+}
+
int c4iw_register_device(struct c4iw_dev *dev)
{
int ret;
@@ -562,9 +599,11 @@ int c4iw_register_device(struct c4iw_dev *dev)
dev->ibdev.req_notify_cq = c4iw_arm_cq;
dev->ibdev.post_send = c4iw_post_send;
dev->ibdev.post_recv = c4iw_post_receive;
- dev->ibdev.get_protocol_stats = c4iw_get_mib;
+ dev->ibdev.alloc_hw_stats = c4iw_alloc_stats;
+ dev->ibdev.get_hw_stats = c4iw_get_mib;
dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
dev->ibdev.get_port_immutable = c4iw_port_immutable;
+ dev->ibdev.get_dev_fw_str = get_dev_fw_str;
dev->ibdev.drain_sq = c4iw_drain_sq;
dev->ibdev.drain_rq = c4iw_drain_rq;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index e8993e49b8b3..690435229be7 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -683,17 +683,25 @@ static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
return 0;
}
+static void _free_qp(struct kref *kref)
+{
+ struct c4iw_qp *qhp;
+
+ qhp = container_of(kref, struct c4iw_qp, kref);
+ PDBG("%s qhp %p\n", __func__, qhp);
+ kfree(qhp);
+}
+
void c4iw_qp_add_ref(struct ib_qp *qp)
{
PDBG("%s ib_qp %p\n", __func__, qp);
- atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+ kref_get(&to_c4iw_qp(qp)->kref);
}
void c4iw_qp_rem_ref(struct ib_qp *qp)
{
PDBG("%s ib_qp %p\n", __func__, qp);
- if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
- wake_up(&(to_c4iw_qp(qp)->wait));
+ kref_put(&to_c4iw_qp(qp)->kref, _free_qp);
}
static void add_to_fc_list(struct list_head *head, struct list_head *entry)
@@ -1081,9 +1089,10 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
qhp->ep->hwtid);
- skb = alloc_skb(sizeof *wqe, gfp);
- if (!skb)
+ skb = skb_dequeue(&qhp->ep->com.ep_skb_list);
+ if (WARN_ON(!skb))
return;
+
set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -1202,9 +1211,10 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
ep->hwtid);
- skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
- if (!skb)
+ skb = skb_dequeue(&ep->com.ep_skb_list);
+ if (WARN_ON(!skb))
return -ENOMEM;
+
set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
@@ -1592,8 +1602,6 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
wait_event(qhp->wait, !qhp->ep);
remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
- atomic_dec(&qhp->refcnt);
- wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
spin_lock_irq(&rhp->lock);
if (!list_empty(&qhp->db_fc_entry))
@@ -1606,8 +1614,9 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
destroy_qp(&rhp->rdev, &qhp->wq,
ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+ c4iw_qp_rem_ref(ib_qp);
+
PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
- kfree(qhp);
return 0;
}
@@ -1704,7 +1713,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
init_completion(&qhp->rq_drained);
mutex_init(&qhp->mutex);
init_waitqueue_head(&qhp->wait);
- atomic_set(&qhp->refcnt, 1);
+ kref_init(&qhp->kref);
ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
if (ret)
@@ -1896,12 +1905,20 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
return 0;
}
+static void move_qp_to_err(struct c4iw_qp *qp)
+{
+ struct c4iw_qp_attributes attrs = { .next_state = C4IW_QP_STATE_ERROR };
+
+ (void)c4iw_modify_qp(qp->rhp, qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+}
+
void c4iw_drain_sq(struct ib_qp *ibqp)
{
struct c4iw_qp *qp = to_c4iw_qp(ibqp);
unsigned long flag;
bool need_to_wait;
+ move_qp_to_err(qp);
spin_lock_irqsave(&qp->lock, flag);
need_to_wait = !t4_sq_empty(&qp->wq);
spin_unlock_irqrestore(&qp->lock, flag);
@@ -1916,6 +1933,7 @@ void c4iw_drain_rq(struct ib_qp *ibqp)
unsigned long flag;
bool need_to_wait;
+ move_qp_to_err(qp);
spin_lock_irqsave(&qp->lock, flag);
need_to_wait = !t4_rq_empty(&qp->wq);
spin_unlock_irqrestore(&qp->lock, flag);
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 6126bbe36095..02173f4315fa 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -634,6 +634,11 @@ static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe)
return (CQE_GENBIT(cqe) == cq->gen);
}
+static inline int t4_cq_notempty(struct t4_cq *cq)
+{
+ return cq->sw_in_use || t4_valid_cqe(cq, &cq->queue[cq->cidx]);
+}
+
static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
{
int ret;
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
new file mode 100644
index 000000000000..f6ea0881765a
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -0,0 +1,29 @@
+config INFINIBAND_HFI1
+ tristate "Intel OPA Gen1 support"
+ depends on X86_64 && INFINIBAND_RDMAVT && I2C
+ select MMU_NOTIFIER
+ select CRC32
+ select I2C_ALGOBIT
+ ---help---
+ This is a low-level driver for Intel OPA Gen1 adapter.
+config HFI1_DEBUG_SDMA_ORDER
+ bool "HFI1 SDMA Order debug"
+ depends on INFINIBAND_HFI1
+ default n
+ ---help---
+ This is a debug flag to test for out of order
+ sdma completions for unit testing
+config HFI1_VERBS_31BIT_PSN
+ bool "HFI1 enable 31 bit PSN"
+ depends on INFINIBAND_HFI1
+ default y
+ ---help---
+ Setting this enables 31 BIT PSN
+ For verbs RC/UC
+config SDMA_VERBOSITY
+ bool "Config SDMA Verbosity"
+ depends on INFINIBAND_HFI1
+ default n
+ ---help---
+ This is a configuration flag to enable verbose
+ SDMA debug
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
new file mode 100644
index 000000000000..0cf97a09b64b
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -0,0 +1,21 @@
+#
+# HFI driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
+
+hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
+ eprom.o file_ops.o firmware.o \
+ init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
+ qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
+ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
+ verbs_txreq.o
+hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
+
+CFLAGS_trace.o = -I$(src)
+ifdef MVERSION
+CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
+endif
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
new file mode 100644
index 000000000000..0566393e5aba
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -0,0 +1,748 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "affinity.h"
+#include "sdma.h"
+#include "trace.h"
+
+struct hfi1_affinity_node_list node_affinity = {
+ .list = LIST_HEAD_INIT(node_affinity.list),
+ .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
+};
+
+/* Name of IRQ types, indexed by enum irq_type */
+static const char * const irq_type_names[] = {
+ "SDMA",
+ "RCVCTXT",
+ "GENERAL",
+ "OTHER",
+};
+
+/* Per NUMA node count of HFI devices */
+static unsigned int *hfi1_per_node_cntr;
+
+static inline void init_cpu_mask_set(struct cpu_mask_set *set)
+{
+ cpumask_clear(&set->mask);
+ cpumask_clear(&set->used);
+ set->gen = 0;
+}
+
+/* Initialize non-HT cpu cores mask */
+void init_real_cpu_mask(void)
+{
+ int possible, curr_cpu, i, ht;
+
+ cpumask_clear(&node_affinity.real_cpu_mask);
+
+ /* Start with cpu online mask as the real cpu mask */
+ cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
+
+ /*
+ * Remove HT cores from the real cpu mask. Do this in two steps below.
+ */
+ possible = cpumask_weight(&node_affinity.real_cpu_mask);
+ ht = cpumask_weight(topology_sibling_cpumask(
+ cpumask_first(&node_affinity.real_cpu_mask)));
+ /*
+ * Step 1. Skip over the first N HT siblings and use them as the
+ * "real" cores. Assumes that HT cores are not enumerated in
+ * succession (except in the single core case).
+ */
+ curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
+ for (i = 0; i < possible / ht; i++)
+ curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
+ /*
+ * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
+ * skip any gaps.
+ */
+ for (; i < possible; i++) {
+ cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
+ curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
+ }
+}
+
+int node_affinity_init(void)
+{
+ int node;
+ struct pci_dev *dev = NULL;
+ const struct pci_device_id *ids = hfi1_pci_tbl;
+
+ cpumask_clear(&node_affinity.proc.used);
+ cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
+
+ node_affinity.proc.gen = 0;
+ node_affinity.num_core_siblings =
+ cpumask_weight(topology_sibling_cpumask(
+ cpumask_first(&node_affinity.proc.mask)
+ ));
+ node_affinity.num_online_nodes = num_online_nodes();
+ node_affinity.num_online_cpus = num_online_cpus();
+
+ /*
+ * The real cpu mask is part of the affinity struct but it has to be
+ * initialized early. It is needed to calculate the number of user
+ * contexts in set_up_context_variables().
+ */
+ init_real_cpu_mask();
+
+ hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
+ sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
+ if (!hfi1_per_node_cntr)
+ return -ENOMEM;
+
+ while (ids->vendor) {
+ dev = NULL;
+ while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
+ node = pcibus_to_node(dev->bus);
+ if (node < 0)
+ node = numa_node_id();
+
+ hfi1_per_node_cntr[node]++;
+ }
+ ids++;
+ }
+
+ return 0;
+}
+
+void node_affinity_destroy(void)
+{
+ struct list_head *pos, *q;
+ struct hfi1_affinity_node *entry;
+
+ spin_lock(&node_affinity.lock);
+ list_for_each_safe(pos, q, &node_affinity.list) {
+ entry = list_entry(pos, struct hfi1_affinity_node,
+ list);
+ list_del(pos);
+ kfree(entry);
+ }
+ spin_unlock(&node_affinity.lock);
+ kfree(hfi1_per_node_cntr);
+}
+
+static struct hfi1_affinity_node *node_affinity_allocate(int node)
+{
+ struct hfi1_affinity_node *entry;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return NULL;
+ entry->node = node;
+ INIT_LIST_HEAD(&entry->list);
+
+ return entry;
+}
+
+/*
+ * It appends an entry to the list.
+ * It *must* be called with node_affinity.lock held.
+ */
+static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
+{
+ list_add_tail(&entry->list, &node_affinity.list);
+}
+
+/* It must be called with node_affinity.lock held */
+static struct hfi1_affinity_node *node_affinity_lookup(int node)
+{
+ struct list_head *pos;
+ struct hfi1_affinity_node *entry;
+
+ list_for_each(pos, &node_affinity.list) {
+ entry = list_entry(pos, struct hfi1_affinity_node, list);
+ if (entry->node == node)
+ return entry;
+ }
+
+ return NULL;
+}
+
+/*
+ * Interrupt affinity.
+ *
+ * non-rcv avail gets a default mask that
+ * starts as possible cpus with threads reset
+ * and each rcv avail reset.
+ *
+ * rcv avail gets node relative 1 wrapping back
+ * to the node relative 1 as necessary.
+ *
+ */
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+{
+ int node = pcibus_to_node(dd->pcidev->bus);
+ struct hfi1_affinity_node *entry;
+ const struct cpumask *local_mask;
+ int curr_cpu, possible, i;
+
+ if (node < 0)
+ node = numa_node_id();
+ dd->node = node;
+
+ local_mask = cpumask_of_node(dd->node);
+ if (cpumask_first(local_mask) >= nr_cpu_ids)
+ local_mask = topology_core_cpumask(0);
+
+ spin_lock(&node_affinity.lock);
+ entry = node_affinity_lookup(dd->node);
+ spin_unlock(&node_affinity.lock);
+
+ /*
+ * If this is the first time this NUMA node's affinity is used,
+ * create an entry in the global affinity structure and initialize it.
+ */
+ if (!entry) {
+ entry = node_affinity_allocate(node);
+ if (!entry) {
+ dd_dev_err(dd,
+ "Unable to allocate global affinity node\n");
+ return -ENOMEM;
+ }
+ init_cpu_mask_set(&entry->def_intr);
+ init_cpu_mask_set(&entry->rcv_intr);
+ cpumask_clear(&entry->general_intr_mask);
+ /* Use the "real" cpu mask of this node as the default */
+ cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
+ local_mask);
+
+ /* fill in the receive list */
+ possible = cpumask_weight(&entry->def_intr.mask);
+ curr_cpu = cpumask_first(&entry->def_intr.mask);
+
+ if (possible == 1) {
+ /* only one CPU, everyone will use it */
+ cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
+ cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+ } else {
+ /*
+ * The general/control context will be the first CPU in
+ * the default list, so it is removed from the default
+ * list and added to the general interrupt list.
+ */
+ cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
+ cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+ curr_cpu = cpumask_next(curr_cpu,
+ &entry->def_intr.mask);
+
+ /*
+ * Remove the remaining kernel receive queues from
+ * the default list and add them to the receive list.
+ */
+ for (i = 0;
+ i < (dd->n_krcv_queues - 1) *
+ hfi1_per_node_cntr[dd->node];
+ i++) {
+ cpumask_clear_cpu(curr_cpu,
+ &entry->def_intr.mask);
+ cpumask_set_cpu(curr_cpu,
+ &entry->rcv_intr.mask);
+ curr_cpu = cpumask_next(curr_cpu,
+ &entry->def_intr.mask);
+ if (curr_cpu >= nr_cpu_ids)
+ break;
+ }
+
+ /*
+ * If there ends up being 0 CPU cores leftover for SDMA
+ * engines, use the same CPU cores as general/control
+ * context.
+ */
+ if (cpumask_weight(&entry->def_intr.mask) == 0)
+ cpumask_copy(&entry->def_intr.mask,
+ &entry->general_intr_mask);
+ }
+
+ spin_lock(&node_affinity.lock);
+ node_affinity_add_tail(entry);
+ spin_unlock(&node_affinity.lock);
+ }
+
+ return 0;
+}
+
+int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
+{
+ int ret;
+ cpumask_var_t diff;
+ struct hfi1_affinity_node *entry;
+ struct cpu_mask_set *set = NULL;
+ struct sdma_engine *sde = NULL;
+ struct hfi1_ctxtdata *rcd = NULL;
+ char extra[64];
+ int cpu = -1;
+
+ extra[0] = '\0';
+ cpumask_clear(&msix->mask);
+
+ ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
+ if (!ret)
+ return -ENOMEM;
+
+ spin_lock(&node_affinity.lock);
+ entry = node_affinity_lookup(dd->node);
+ spin_unlock(&node_affinity.lock);
+
+ switch (msix->type) {
+ case IRQ_SDMA:
+ sde = (struct sdma_engine *)msix->arg;
+ scnprintf(extra, 64, "engine %u", sde->this_idx);
+ set = &entry->def_intr;
+ break;
+ case IRQ_GENERAL:
+ cpu = cpumask_first(&entry->general_intr_mask);
+ break;
+ case IRQ_RCVCTXT:
+ rcd = (struct hfi1_ctxtdata *)msix->arg;
+ if (rcd->ctxt == HFI1_CTRL_CTXT)
+ cpu = cpumask_first(&entry->general_intr_mask);
+ else
+ set = &entry->rcv_intr;
+ scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
+ break;
+ default:
+ dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
+ return -EINVAL;
+ }
+
+ /*
+ * The general and control contexts are placed on a particular
+ * CPU, which is set above. Skip accounting for it. Everything else
+ * finds its CPU here.
+ */
+ if (cpu == -1 && set) {
+ spin_lock(&node_affinity.lock);
+ if (cpumask_equal(&set->mask, &set->used)) {
+ /*
+ * We've used up all the CPUs, bump up the generation
+ * and reset the 'used' map
+ */
+ set->gen++;
+ cpumask_clear(&set->used);
+ }
+ cpumask_andnot(diff, &set->mask, &set->used);
+ cpu = cpumask_first(diff);
+ cpumask_set_cpu(cpu, &set->used);
+ spin_unlock(&node_affinity.lock);
+ }
+
+ switch (msix->type) {
+ case IRQ_SDMA:
+ sde->cpu = cpu;
+ break;
+ case IRQ_GENERAL:
+ case IRQ_RCVCTXT:
+ case IRQ_OTHER:
+ break;
+ }
+
+ cpumask_set_cpu(cpu, &msix->mask);
+ dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n",
+ msix->msix.vector, irq_type_names[msix->type],
+ extra, cpu);
+ irq_set_affinity_hint(msix->msix.vector, &msix->mask);
+
+ free_cpumask_var(diff);
+ return 0;
+}
+
+void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
+ struct hfi1_msix_entry *msix)
+{
+ struct cpu_mask_set *set = NULL;
+ struct hfi1_ctxtdata *rcd;
+ struct hfi1_affinity_node *entry;
+
+ spin_lock(&node_affinity.lock);
+ entry = node_affinity_lookup(dd->node);
+ spin_unlock(&node_affinity.lock);
+
+ switch (msix->type) {
+ case IRQ_SDMA:
+ set = &entry->def_intr;
+ break;
+ case IRQ_GENERAL:
+ /* Don't do accounting for general contexts */
+ break;
+ case IRQ_RCVCTXT:
+ rcd = (struct hfi1_ctxtdata *)msix->arg;
+ /* Don't do accounting for control contexts */
+ if (rcd->ctxt != HFI1_CTRL_CTXT)
+ set = &entry->rcv_intr;
+ break;
+ default:
+ return;
+ }
+
+ if (set) {
+ spin_lock(&node_affinity.lock);
+ cpumask_andnot(&set->used, &set->used, &msix->mask);
+ if (cpumask_empty(&set->used) && set->gen) {
+ set->gen--;
+ cpumask_copy(&set->used, &set->mask);
+ }
+ spin_unlock(&node_affinity.lock);
+ }
+
+ irq_set_affinity_hint(msix->msix.vector, NULL);
+ cpumask_clear(&msix->mask);
+}
+
+/* This should be called with node_affinity.lock held */
+static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
+ struct hfi1_affinity_node_list *affinity)
+{
+ int possible, curr_cpu, i;
+ uint num_cores_per_socket = node_affinity.num_online_cpus /
+ affinity->num_core_siblings /
+ node_affinity.num_online_nodes;
+
+ cpumask_copy(hw_thread_mask, &affinity->proc.mask);
+ if (affinity->num_core_siblings > 0) {
+ /* Removing other siblings not needed for now */
+ possible = cpumask_weight(hw_thread_mask);
+ curr_cpu = cpumask_first(hw_thread_mask);
+ for (i = 0;
+ i < num_cores_per_socket * node_affinity.num_online_nodes;
+ i++)
+ curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
+
+ for (; i < possible; i++) {
+ cpumask_clear_cpu(curr_cpu, hw_thread_mask);
+ curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
+ }
+
+ /* Identifying correct HW threads within physical cores */
+ cpumask_shift_left(hw_thread_mask, hw_thread_mask,
+ num_cores_per_socket *
+ node_affinity.num_online_nodes *
+ hw_thread_no);
+ }
+}
+
+int hfi1_get_proc_affinity(int node)
+{
+ int cpu = -1, ret, i;
+ struct hfi1_affinity_node *entry;
+ cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
+ const struct cpumask *node_mask,
+ *proc_mask = tsk_cpus_allowed(current);
+ struct hfi1_affinity_node_list *affinity = &node_affinity;
+ struct cpu_mask_set *set = &affinity->proc;
+
+ /*
+ * check whether process/context affinity has already
+ * been set
+ */
+ if (cpumask_weight(proc_mask) == 1) {
+ hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
+ current->pid, current->comm,
+ cpumask_pr_args(proc_mask));
+ /*
+ * Mark the pre-set CPU as used. This is atomic so we don't
+ * need the lock
+ */
+ cpu = cpumask_first(proc_mask);
+ cpumask_set_cpu(cpu, &set->used);
+ goto done;
+ } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+ hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
+ current->pid, current->comm,
+ cpumask_pr_args(proc_mask));
+ goto done;
+ }
+
+ /*
+ * The process does not have a preset CPU affinity so find one to
+ * recommend using the following algorithm:
+ *
+ * For each user process that is opening a context on HFI Y:
+ * a) If all cores are filled, reinitialize the bitmask
+ * b) Fill real cores first, then HT cores (First set of HT
+ * cores on all physical cores, then second set of HT core,
+ * and, so on) in the following order:
+ *
+ * 1. Same NUMA node as HFI Y and not running an IRQ
+ * handler
+ * 2. Same NUMA node as HFI Y and running an IRQ handler
+ * 3. Different NUMA node to HFI Y and not running an IRQ
+ * handler
+ * 4. Different NUMA node to HFI Y and running an IRQ
+ * handler
+ * c) Mark core as filled in the bitmask. As user processes are
+ * done, clear cores from the bitmask.
+ */
+
+ ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
+ if (!ret)
+ goto done;
+ ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
+ if (!ret)
+ goto free_diff;
+ ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
+ if (!ret)
+ goto free_hw_thread_mask;
+ ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
+ if (!ret)
+ goto free_available_mask;
+
+ spin_lock(&affinity->lock);
+ /*
+ * If we've used all available HW threads, clear the mask and start
+ * overloading.
+ */
+ if (cpumask_equal(&set->mask, &set->used)) {
+ set->gen++;
+ cpumask_clear(&set->used);
+ }
+
+ /*
+ * If NUMA node has CPUs used by interrupt handlers, include them in the
+ * interrupt handler mask.
+ */
+ entry = node_affinity_lookup(node);
+ if (entry) {
+ cpumask_copy(intrs_mask, (entry->def_intr.gen ?
+ &entry->def_intr.mask :
+ &entry->def_intr.used));
+ cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
+ &entry->rcv_intr.mask :
+ &entry->rcv_intr.used));
+ cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
+ }
+ hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
+ cpumask_pr_args(intrs_mask));
+
+ cpumask_copy(hw_thread_mask, &set->mask);
+
+ /*
+ * If HT cores are enabled, identify which HW threads within the
+ * physical cores should be used.
+ */
+ if (affinity->num_core_siblings > 0) {
+ for (i = 0; i < affinity->num_core_siblings; i++) {
+ find_hw_thread_mask(i, hw_thread_mask, affinity);
+
+ /*
+ * If there's at least one available core for this HW
+ * thread number, stop looking for a core.
+ *
+ * diff will always be not empty at least once in this
+ * loop as the used mask gets reset when
+ * (set->mask == set->used) before this loop.
+ */
+ cpumask_andnot(diff, hw_thread_mask, &set->used);
+ if (!cpumask_empty(diff))
+ break;
+ }
+ }
+ hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
+ cpumask_pr_args(hw_thread_mask));
+
+ node_mask = cpumask_of_node(node);
+ hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
+ cpumask_pr_args(node_mask));
+
+ /* Get cpumask of available CPUs on preferred NUMA */
+ cpumask_and(available_mask, hw_thread_mask, node_mask);
+ cpumask_andnot(available_mask, available_mask, &set->used);
+ hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
+ cpumask_pr_args(available_mask));
+
+ /*
+ * At first, we don't want to place processes on the same
+ * CPUs as interrupt handlers. Then, CPUs running interrupt
+ * handlers are used.
+ *
+ * 1) If diff is not empty, then there are CPUs not running
+ * non-interrupt handlers available, so diff gets copied
+ * over to available_mask.
+ * 2) If diff is empty, then all CPUs not running interrupt
+ * handlers are taken, so available_mask contains all
+ * available CPUs running interrupt handlers.
+ * 3) If available_mask is empty, then all CPUs on the
+ * preferred NUMA node are taken, so other NUMA nodes are
+ * used for process assignments using the same method as
+ * the preferred NUMA node.
+ */
+ cpumask_andnot(diff, available_mask, intrs_mask);
+ if (!cpumask_empty(diff))
+ cpumask_copy(available_mask, diff);
+
+ /* If we don't have CPUs on the preferred node, use other NUMA nodes */
+ if (cpumask_empty(available_mask)) {
+ cpumask_andnot(available_mask, hw_thread_mask, &set->used);
+ /* Excluding preferred NUMA cores */
+ cpumask_andnot(available_mask, available_mask, node_mask);
+ hfi1_cdbg(PROC,
+ "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
+ cpumask_pr_args(available_mask));
+
+ /*
+ * At first, we don't want to place processes on the same
+ * CPUs as interrupt handlers.
+ */
+ cpumask_andnot(diff, available_mask, intrs_mask);
+ if (!cpumask_empty(diff))
+ cpumask_copy(available_mask, diff);
+ }
+ hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
+ cpumask_pr_args(available_mask));
+
+ cpu = cpumask_first(available_mask);
+ if (cpu >= nr_cpu_ids) /* empty */
+ cpu = -1;
+ else
+ cpumask_set_cpu(cpu, &set->used);
+ spin_unlock(&affinity->lock);
+ hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
+
+ free_cpumask_var(intrs_mask);
+free_available_mask:
+ free_cpumask_var(available_mask);
+free_hw_thread_mask:
+ free_cpumask_var(hw_thread_mask);
+free_diff:
+ free_cpumask_var(diff);
+done:
+ return cpu;
+}
+
+void hfi1_put_proc_affinity(int cpu)
+{
+ struct hfi1_affinity_node_list *affinity = &node_affinity;
+ struct cpu_mask_set *set = &affinity->proc;
+
+ if (cpu < 0)
+ return;
+ spin_lock(&affinity->lock);
+ cpumask_clear_cpu(cpu, &set->used);
+ hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
+ if (cpumask_empty(&set->used) && set->gen) {
+ set->gen--;
+ cpumask_copy(&set->used, &set->mask);
+ }
+ spin_unlock(&affinity->lock);
+}
+
+/* Prevents concurrent reads and writes of the sdma_affinity attrib */
+static DEFINE_MUTEX(sdma_affinity_mutex);
+
+int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
+ size_t count)
+{
+ struct hfi1_affinity_node *entry;
+ cpumask_var_t mask;
+ int ret, i;
+
+ spin_lock(&node_affinity.lock);
+ entry = node_affinity_lookup(dd->node);
+ spin_unlock(&node_affinity.lock);
+
+ if (!entry)
+ return -EINVAL;
+
+ ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+ if (!ret)
+ return -ENOMEM;
+
+ ret = cpulist_parse(buf, mask);
+ if (ret)
+ goto out;
+
+ if (!cpumask_subset(mask, cpu_online_mask) || cpumask_empty(mask)) {
+ dd_dev_warn(dd, "Invalid CPU mask\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&sdma_affinity_mutex);
+ /* reset the SDMA interrupt affinity details */
+ init_cpu_mask_set(&entry->def_intr);
+ cpumask_copy(&entry->def_intr.mask, mask);
+ /*
+ * Reassign the affinity for each SDMA interrupt.
+ */
+ for (i = 0; i < dd->num_msix_entries; i++) {
+ struct hfi1_msix_entry *msix;
+
+ msix = &dd->msix_entries[i];
+ if (msix->type != IRQ_SDMA)
+ continue;
+
+ ret = hfi1_get_irq_affinity(dd, msix);
+
+ if (ret)
+ break;
+ }
+ mutex_unlock(&sdma_affinity_mutex);
+out:
+ free_cpumask_var(mask);
+ return ret ? ret : strnlen(buf, PAGE_SIZE);
+}
+
+int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
+{
+ struct hfi1_affinity_node *entry;
+
+ spin_lock(&node_affinity.lock);
+ entry = node_affinity_lookup(dd->node);
+ spin_unlock(&node_affinity.lock);
+
+ if (!entry)
+ return -EINVAL;
+
+ mutex_lock(&sdma_affinity_mutex);
+ cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
+ mutex_unlock(&sdma_affinity_mutex);
+ return strnlen(buf, PAGE_SIZE);
+}
diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h
new file mode 100644
index 000000000000..8879cf7a8cac
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/affinity.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_AFFINITY_H
+#define _HFI1_AFFINITY_H
+
+#include "hfi.h"
+
+enum irq_type {
+ IRQ_SDMA,
+ IRQ_RCVCTXT,
+ IRQ_GENERAL,
+ IRQ_OTHER
+};
+
+/* Can be used for both memory and cpu */
+enum affinity_flags {
+ AFF_AUTO,
+ AFF_NUMA_LOCAL,
+ AFF_DEV_LOCAL,
+ AFF_IRQ_LOCAL
+};
+
+struct cpu_mask_set {
+ struct cpumask mask;
+ struct cpumask used;
+ uint gen;
+};
+
+struct hfi1_affinity {
+ struct cpu_mask_set def_intr;
+ struct cpu_mask_set rcv_intr;
+ struct cpumask real_cpu_mask;
+ /* spin lock to protect affinity struct */
+ spinlock_t lock;
+};
+
+struct hfi1_msix_entry;
+
+/* Initialize non-HT cpu cores mask */
+void init_real_cpu_mask(void);
+/* Initialize driver affinity data */
+int hfi1_dev_affinity_init(struct hfi1_devdata *);
+/*
+ * Set IRQ affinity to a CPU. The function will determine the
+ * CPU and set the affinity to it.
+ */
+int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+/*
+ * Remove the IRQ's CPU affinity. This function also updates
+ * any internal CPU tracking data
+ */
+void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
+/*
+ * Determine a CPU affinity for a user process, if the process does not
+ * have an affinity set yet.
+ */
+int hfi1_get_proc_affinity(int);
+/* Release a CPU used by a user process. */
+void hfi1_put_proc_affinity(int);
+
+int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf);
+int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
+ size_t count);
+
+struct hfi1_affinity_node {
+ int node;
+ struct cpu_mask_set def_intr;
+ struct cpu_mask_set rcv_intr;
+ struct cpumask general_intr_mask;
+ struct list_head list;
+};
+
+struct hfi1_affinity_node_list {
+ struct list_head list;
+ struct cpumask real_cpu_mask;
+ struct cpu_mask_set proc;
+ int num_core_siblings;
+ int num_online_nodes;
+ int num_online_cpus;
+ /* protect affinity node list */
+ spinlock_t lock;
+};
+
+int node_affinity_init(void);
+void node_affinity_destroy(void);
+extern struct hfi1_affinity_node_list node_affinity;
+
+#endif /* _HFI1_AFFINITY_H */
diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
new file mode 100644
index 000000000000..0d58fe3b49b5
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/aspm.h
@@ -0,0 +1,309 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _ASPM_H
+#define _ASPM_H
+
+#include "hfi.h"
+
+extern uint aspm_mode;
+
+enum aspm_mode {
+ ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */
+ ASPM_MODE_ENABLED = 1, /* ASPM always enabled, power saving mode */
+ ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */
+};
+
+/* Time after which the timer interrupt will re-enable ASPM */
+#define ASPM_TIMER_MS 1000
+/* Time for which interrupts are ignored after a timer has been scheduled */
+#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
+/* Two interrupts within this time trigger ASPM disable */
+#define ASPM_TRIGGER_MS 1
+#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
+#define ASPM_L1_SUPPORTED(reg) \
+ (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+
+static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+ u32 up, dn;
+
+ /*
+ * If the driver does not have access to the upstream component,
+ * it cannot support ASPM L1 at all.
+ */
+ if (!parent)
+ return false;
+
+ pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
+ dn = ASPM_L1_SUPPORTED(dn);
+
+ pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
+ up = ASPM_L1_SUPPORTED(up);
+
+ /* ASPM works on A-step but is reported as not supported */
+ return (!!dn || is_ax(dd)) && !!up;
+}
+
+/* Set L1 entrance latency for slower entry to L1 */
+static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
+{
+ u32 l1_ent_lat = 0x4u;
+ u32 reg32;
+
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, &reg32);
+ reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
+ reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
+}
+
+static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+
+ /*
+ * If the driver does not have access to the upstream component,
+ * it cannot support ASPM L1 at all.
+ */
+ if (!parent)
+ return;
+
+ /* Enable ASPM L1 first in upstream component and then downstream */
+ pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC,
+ PCI_EXP_LNKCTL_ASPM_L1);
+ pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC,
+ PCI_EXP_LNKCTL_ASPM_L1);
+}
+
+static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+
+ /* Disable ASPM L1 first in downstream component and then upstream */
+ pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC, 0x0);
+ if (parent)
+ pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC, 0x0);
+}
+
+static inline void aspm_enable(struct hfi1_devdata *dd)
+{
+ if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
+ !dd->aspm_supported)
+ return;
+
+ aspm_hw_enable_l1(dd);
+ dd->aspm_enabled = true;
+}
+
+static inline void aspm_disable(struct hfi1_devdata *dd)
+{
+ if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
+ return;
+
+ aspm_hw_disable_l1(dd);
+ dd->aspm_enabled = false;
+}
+
+static inline void aspm_disable_inc(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->aspm_lock, flags);
+ aspm_disable(dd);
+ atomic_inc(&dd->aspm_disabled_cnt);
+ spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+static inline void aspm_enable_dec(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->aspm_lock, flags);
+ if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
+ aspm_enable(dd);
+ spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+/* ASPM processing for each receive context interrupt */
+static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
+{
+ bool restart_timer;
+ bool close_interrupts;
+ unsigned long flags;
+ ktime_t now, prev;
+
+ /* Quickest exit for minimum impact */
+ if (!rcd->aspm_intr_supported)
+ return;
+
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ /* PSM contexts are open */
+ if (!rcd->aspm_intr_enable)
+ goto unlock;
+
+ prev = rcd->aspm_ts_last_intr;
+ now = ktime_get();
+ rcd->aspm_ts_last_intr = now;
+
+ /* An interrupt pair close together in time */
+ close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
+
+ /* Don't push out our timer till this much time has elapsed */
+ restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
+ ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
+ restart_timer = restart_timer && close_interrupts;
+
+ /* Disable ASPM and schedule timer */
+ if (rcd->aspm_enabled && close_interrupts) {
+ aspm_disable_inc(rcd->dd);
+ rcd->aspm_enabled = false;
+ restart_timer = true;
+ }
+
+ if (restart_timer) {
+ mod_timer(&rcd->aspm_timer,
+ jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
+ rcd->aspm_ts_timer_sched = now;
+ }
+unlock:
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/* Timer function for re-enabling ASPM in the absence of interrupt activity */
+static inline void aspm_ctx_timer_function(unsigned long data)
+{
+ struct hfi1_ctxtdata *rcd = (struct hfi1_ctxtdata *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ aspm_enable_dec(rcd->dd);
+ rcd->aspm_enabled = true;
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/* Disable interrupt processing for verbs contexts when PSM contexts are open */
+static inline void aspm_disable_all(struct hfi1_devdata *dd)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned long flags;
+ unsigned i;
+
+ for (i = 0; i < dd->first_user_ctxt; i++) {
+ rcd = dd->rcd[i];
+ del_timer_sync(&rcd->aspm_timer);
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ rcd->aspm_intr_enable = false;
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+ }
+
+ aspm_disable(dd);
+ atomic_set(&dd->aspm_disabled_cnt, 0);
+}
+
+/* Re-enable interrupt processing for verbs contexts */
+static inline void aspm_enable_all(struct hfi1_devdata *dd)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned long flags;
+ unsigned i;
+
+ aspm_enable(dd);
+
+ if (aspm_mode != ASPM_MODE_DYNAMIC)
+ return;
+
+ for (i = 0; i < dd->first_user_ctxt; i++) {
+ rcd = dd->rcd[i];
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ rcd->aspm_intr_enable = true;
+ rcd->aspm_enabled = true;
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+ }
+}
+
+static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
+{
+ spin_lock_init(&rcd->aspm_lock);
+ setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function,
+ (unsigned long)rcd);
+ rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
+ aspm_mode == ASPM_MODE_DYNAMIC &&
+ rcd->ctxt < rcd->dd->first_user_ctxt;
+}
+
+static inline void aspm_init(struct hfi1_devdata *dd)
+{
+ unsigned i;
+
+ spin_lock_init(&dd->aspm_lock);
+ dd->aspm_supported = aspm_hw_l1_supported(dd);
+
+ for (i = 0; i < dd->first_user_ctxt; i++)
+ aspm_ctx_init(dd->rcd[i]);
+
+ /* Start with ASPM disabled */
+ aspm_hw_set_l1_ent_latency(dd);
+ dd->aspm_enabled = false;
+ aspm_hw_disable_l1(dd);
+
+ /* Now turn on ASPM if configured */
+ aspm_enable_all(dd);
+}
+
+static inline void aspm_exit(struct hfi1_devdata *dd)
+{
+ aspm_disable_all(dd);
+
+ /* Turn on ASPM on exit to conserve power */
+ aspm_enable(dd);
+}
+
+#endif /* _ASPM_H */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
new file mode 100644
index 000000000000..cc38004cea42
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -0,0 +1,14886 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+#include "efivar.h"
+#include "platform.h"
+#include "aspm.h"
+#include "affinity.h"
+
+#define NUM_IB_PORTS 1
+
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header byte = 10304 byte
+ * 10304 byte / 12.5 GB/s = 824.32ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
+
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable");
+
+/* Other driver tunables */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+struct flag_table {
+ u64 flag; /* the flag */
+ char *str; /* description string */
+ u16 extra; /* extra information */
+ u16 unused0;
+ u32 unused1;
+};
+
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
+
+/* Send Error Consequences */
+#define SEC_WRITE_DROPPED 0x1
+#define SEC_PACKET_DROPPED 0x2
+#define SEC_SC_HALTED 0x4 /* per-context only */
+#define SEC_SPC_FREEZE 0x8 /* per-HFI only */
+
+#define DEFAULT_KRCVQS 2
+#define MIN_KERNEL_KCTXTS 2
+#define FIRST_KERNEL_KCTXT 1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES 256
+#define NUM_MAP_REGS 32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT 39
+
+/* extract the emulation revision */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/* RSM fields */
+
+/* packet type */
+#define IB_PACKET_TYPE 2ull
+#define QW_SHIFT 6ull
+/* QPN[7..1] */
+#define QPN_WIDTH 7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW 0ull
+#define LRH_BTH_BIT_OFFSET 48ull
+#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK 3ull
+#define LRH_BTH_VALUE 2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW 0ull
+#define LRH_SC_BIT_OFFSET 56ull
+#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK 128ull
+#define LRH_SC_VALUE 0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull))
+
+/* defines to build power on SC2VL table */
+#define SC2VL_VAL( \
+ num, \
+ sc0, sc0val, \
+ sc1, sc1val, \
+ sc2, sc2val, \
+ sc3, sc3val, \
+ sc4, sc4val, \
+ sc5, sc5val, \
+ sc6, sc6val, \
+ sc7, sc7val) \
+( \
+ ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+ ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+ ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+ ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+ ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+ ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+ ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+ ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \
+)
+
+#define DC_SC_VL_VAL( \
+ range, \
+ e0, e0val, \
+ e1, e1val, \
+ e2, e2val, \
+ e3, e3val, \
+ e4, e4val, \
+ e5, e5val, \
+ e6, e6val, \
+ e7, e7val, \
+ e8, e8val, \
+ e9, e9val, \
+ e10, e10val, \
+ e11, e11val, \
+ e12, e12val, \
+ e13, e13val, \
+ e14, e14val, \
+ e15, e15val) \
+( \
+ ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+ ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+ ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+ ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+ ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+ ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+ ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+ ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+ ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+ ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+ ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+ ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+ ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+ ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+ ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+ ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
+
+/* all CceStatus sub-block freeze bits */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+ | CCE_STATUS_RXE_FROZE_SMASK \
+ | CCE_STATUS_TXE_FROZE_SMASK \
+ | CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+ | CCE_STATUS_TXE_PAUSED_SMASK \
+ | CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+/*
+ * CCE Error flags.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
+ CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
+ CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+ CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
+ CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
+ CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
+ CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+ CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+ CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+ CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+ CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+ CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+ CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
+ CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
+ CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
+ CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
+ CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
+ CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
+ CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
+ CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
+ CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
+ CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
+ CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
+ CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
+ CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+ CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/ FLAG_ENTRY0("LATriggered",
+ CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
+ CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
+ CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+ CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+ CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
+ CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
+ CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
+ CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
+ CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
+ CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
+ SEC_WRITE_DROPPED,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("PioCsrParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/ FLAG_ENTRY("PioPccFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY("PioPecFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY("PioSmPktResetParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/ FLAG_ENTRY("PioInitSmIn",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
+ 0,
+ SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/ FLAG_ENTRY("PioWriteDataParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/ FLAG_ENTRY("PioStateMachine",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
+ SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
+ SEC_WRITE_DROPPED | SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/ FLAG_ENTRY("PioVlfSopParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/ FLAG_ENTRY("PioVlFifoParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY("PioPpmcSopLen",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
+ SEC_SPC_FREEZE,
+ SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/* TXE PIO errors that cause an SPC freeze */
+#define ALL_PIO_FREEZE_ERR \
+ (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+ | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
+ SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
+ SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+ SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+ SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/* TXE SDMA errors that cause an SPC freeze */
+#define ALL_SDMA_FREEZE_ERR \
+ (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+ | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+ | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */
+#define PORT_DISCARD_EGRESS_ERRS \
+ (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \
+ | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \
+ | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+ SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+ SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+ SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+ SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+ SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+ SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+ SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+ SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+ SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+ SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+ SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+ SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+ SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+ SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+ SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+ SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+ SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+ SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+ SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+ SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+ SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+ SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+ SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/* TXE Egress errors that cause an SPC freeze */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+ (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+ | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+ | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+ | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+ | SEES(TX_LAUNCH_CSR_PARITY) \
+ | SEES(TX_SBRD_CTL_CSR_PARITY) \
+ | SEES(TX_CONFIG_PARITY) \
+ | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+ | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+ | SEES(TX_CREDIT_RETURN_PARITY))
+
+/*
+ * TXE Send error flags
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("InconsistentSop",
+ SEC_PACKET_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("DisallowedPacket",
+ SEC_PACKET_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
+ SEC_WRITE_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("WriteOverflow",
+ SEC_WRITE_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
+ SEC_WRITE_DROPPED | SEC_SC_HALTED,
+ SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved*/
+};
+
+/*
+ * RXE Receive Error flags
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+ RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+ RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+ RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+ RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+ RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+ RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+ RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+ RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+ RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+ RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+ RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+ RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/* RXE errors that will trigger an SPC freeze */
+#define ALL_RXE_FREEZE_ERR \
+ (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+ | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK \
+ (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+ RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+ RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+ FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+ FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+ FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+ FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+ FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+ FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+ FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+ FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+ FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+ FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+ FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+ FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+ FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+ FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+ FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+ FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+ FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+ FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+ FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+ FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+ FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+ FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+ FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+ FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+ FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+ FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+ FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+ FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+ FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+ FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+ FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+ FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+ FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+ FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+ FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+ FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+ FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+ FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+ FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+ FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+ FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+ FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+ FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+ FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+ FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+ FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+ LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+ LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+ LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+ LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+ LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+ LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+ FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+ FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+ FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+ FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+ FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+ FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+ FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+ FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+ FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+ D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+ FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+ FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED),
+ FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME),
+ FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET),
+ FLAG_ENTRY0("Serdes internal loopback failure",
+ FAILED_SERDES_INTERNAL_LOOPBACK),
+ FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT),
+ FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING),
+ FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE),
+ FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM),
+ FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ),
+ FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+ FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+ FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT),
+ FLAG_ENTRY0("Host Handshake Timeout", HOST_HANDSHAKE_TIMEOUT)
+};
+
+/*
+ * DC8051 Information Host Information flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+ FLAG_ENTRY0("Host request done", 0x0001),
+ FLAG_ENTRY0("BC SMA message", 0x0002),
+ FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+ FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+ FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+ FLAG_ENTRY0("External device config request", 0x0020),
+ FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+ FLAG_ENTRY0("LinkUp achieved", 0x0080),
+ FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+ u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+ u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+ u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+ u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+ u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+ u8 *tx_polarity_inversion,
+ u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+ unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+ unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+ unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+ u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+ u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+ int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+ unsigned int *np);
+static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd);
+
+/*
+ * Error interrupt table entry. This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt register.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ */
+struct err_reg_info {
+ u32 status; /* status CSR offset */
+ u32 clear; /* clear CSR offset */
+ u32 mask; /* mask CSR offset */
+ void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+ const char *desc;
+};
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries. Different
+ * helpers are needed because of inconsistent register names.
+ */
+#define EE(reg, handler, desc) \
+ { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+ handler, desc }
+#define DC_EE1(reg, handler, desc) \
+ { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+ { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
+
+/*
+ * Table of the "misc" grouping of error interrupts. Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"),
+/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"),
+/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"),
+/* 3*/ { 0, 0, 0, NULL }, /* reserved */
+/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"),
+/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"),
+/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr")
+ /* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.
+ */
+static const struct err_reg_info sdma_eng_err =
+ EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
+/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
+/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"),
+/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"),
+/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
+ /* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register can not be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
+
+/*
+ * Table of the DC grouping of error interrupts. Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"),
+/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"),
+/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"),
+/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
+ /* the rest are reserved */
+};
+
+struct cntr_entry {
+ /*
+ * counter name
+ */
+ char *name;
+
+ /*
+ * csr to read for name (if applicable)
+ */
+ u64 csr;
+
+ /*
+ * offset into dd or ppd to store the counter's value
+ */
+ int offset;
+
+ /*
+ * flags
+ */
+ u8 flags;
+
+ /*
+ * accessor for stat element, context either dd or ppd
+ */
+ u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl,
+ int mode, u64 data);
+};
+
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+{ \
+ name, \
+ csr, \
+ offset, \
+ flags, \
+ accessor \
+}
+
+/* 32bit RXE */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ dev_access_u32_csr)
+
+/* 64bit RXE */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY64), \
+ 0, flags, \
+ port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + RCV_COUNTER_ARRAY64), \
+ 0, flags, \
+ dev_access_u64_csr)
+
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+ (RCV_HDR_OVFL_CNT + ctx * 0x100), \
+ 0, CNTR_NORMAL, port_access_u64_csr)
+
+/* 32bit TXE */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + SEND_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ port_access_u32_csr)
+
+/* 64bit TXE */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + SEND_COUNTER_ARRAY64), \
+ 0, flags, \
+ port_access_u64_csr)
+
+# define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name,\
+ counter * 8 + SEND_COUNTER_ARRAY64, \
+ 0, \
+ flags, \
+ dev_access_u64_csr)
+
+/* CCE */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + CCE_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+ (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
+ 0, flags | CNTR_32BIT, \
+ dev_access_u32_csr)
+
+/* DC */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, \
+ counter, \
+ 0, \
+ flags, \
+ dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, \
+ counter, \
+ 0, \
+ flags, \
+ dc_access_lcb_cntr)
+
+/* ibp counters */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, \
+ 0, \
+ 0, \
+ CNTR_SYNTH, \
+ access_ibp_##cntr)
+
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+ if (dd->flags & HFI1_PRESENT) {
+ return readq((void __iomem *)dd->kregbase + offset);
+ }
+ return -1;
+}
+
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+ if (dd->flags & HFI1_PRESENT)
+ writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+void __iomem *get_csr_addr(
+ struct hfi1_devdata *dd,
+ u32 offset)
+{
+ return (void __iomem *)dd->kregbase + offset;
+}
+
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+ int mode, u64 value)
+{
+ u64 ret;
+
+ if (mode == CNTR_MODE_R) {
+ ret = read_csr(dd, csr);
+ } else if (mode == CNTR_MODE_W) {
+ write_csr(dd, csr, value);
+ ret = value;
+ } else {
+ dd_dev_err(dd, "Invalid cntr register access mode");
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
+ return ret;
+}
+
+/* Dev Access */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+ u64 csr = entry->csr;
+
+ if (entry->flags & CNTR_SDMA) {
+ if (vl == CNTR_INVALID_VL)
+ return 0;
+ csr += 0x100 * vl;
+ } else {
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ }
+ return read_write_csr(dd, csr, mode, data);
+}
+
+static u64 access_sde_err_cnt(const struct cntr_entry *entry,
+ void *context, int idx, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ if (dd->per_sdma && idx < dd->num_sdma)
+ return dd->per_sdma[idx].err_cnt;
+ return 0;
+}
+
+static u64 access_sde_int_cnt(const struct cntr_entry *entry,
+ void *context, int idx, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ if (dd->per_sdma && idx < dd->num_sdma)
+ return dd->per_sdma[idx].sdma_int_cnt;
+ return 0;
+}
+
+static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry,
+ void *context, int idx, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ if (dd->per_sdma && idx < dd->num_sdma)
+ return dd->per_sdma[idx].idle_int_cnt;
+ return 0;
+}
+
+static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry,
+ void *context, int idx, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ if (dd->per_sdma && idx < dd->num_sdma)
+ return dd->per_sdma[idx].progress_int_cnt;
+ return 0;
+}
+
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ u64 val = 0;
+ u64 csr = entry->csr;
+
+ if (entry->flags & CNTR_VL) {
+ if (vl == CNTR_INVALID_VL)
+ return 0;
+ csr += 8 * vl;
+ } else {
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ }
+
+ val = read_write_csr(dd, csr, mode, data);
+ return val;
+}
+
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+ u32 csr = entry->csr;
+ int ret = 0;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ if (mode == CNTR_MODE_R)
+ ret = read_lcb_csr(dd, csr, &data);
+ else if (mode == CNTR_MODE_W)
+ ret = write_lcb_csr(dd, csr, data);
+
+ if (ret) {
+ dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+ return data;
+}
+
+/* Port Access */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_csr(ppd->dd, entry->csr, mode, data);
+}
+
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = context;
+ u64 val;
+ u64 csr = entry->csr;
+
+ if (entry->flags & CNTR_VL) {
+ if (vl == CNTR_INVALID_VL)
+ return 0;
+ csr += 8 * vl;
+ } else {
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ }
+ val = read_write_csr(ppd->dd, csr, mode, data);
+ return val;
+}
+
+/* Software defined */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+ u64 data)
+{
+ u64 ret;
+
+ if (mode == CNTR_MODE_R) {
+ ret = *cntr;
+ } else if (mode == CNTR_MODE_W) {
+ *cntr = data;
+ ret = data;
+ } else {
+ dd_dev_err(dd, "Invalid cntr sw access mode");
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
+
+ return ret;
+}
+
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+}
+
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+ int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+}
+
+static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+ return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data);
+}
+
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+ u64 zero = 0;
+ u64 *counter;
+
+ if (vl == CNTR_INVALID_VL)
+ counter = &ppd->port_xmit_discards;
+ else if (vl >= 0 && vl < C_VL_COUNT)
+ counter = &ppd->port_xmit_discards_vl[vl];
+ else
+ counter = &zero;
+
+ return read_write_sw(ppd->dd, counter, mode, data);
+}
+
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_pportdata *ppd = context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
+ mode, data);
+}
+
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_pportdata *ppd = context;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
+ mode, data);
+}
+
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+ int cpu;
+ u64 counter = 0;
+
+ for_each_possible_cpu(cpu)
+ counter += *per_cpu_ptr(cntr, cpu);
+ return counter;
+}
+
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+ u64 __percpu *cntr,
+ int vl, int mode, u64 data)
+{
+ u64 ret = 0;
+
+ if (vl != CNTR_INVALID_VL)
+ return 0;
+
+ if (mode == CNTR_MODE_R) {
+ ret = get_all_cpu_total(cntr) - *z_val;
+ } else if (mode == CNTR_MODE_W) {
+ /* A write can only zero the counter */
+ if (data == 0)
+ *z_val = get_all_cpu_total(cntr);
+ else
+ dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+ } else {
+ dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+ return 0;
+ }
+
+ return ret;
+}
+
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
+ mode, data);
+}
+
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
+ mode, data);
+}
+
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return dd->verbs_dev.n_piowait;
+}
+
+static u64 access_sw_pio_drain(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->verbs_dev.n_piodrain;
+}
+
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return dd->verbs_dev.n_txwait;
+}
+
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return dd->verbs_dev.n_kmem_wait;
+}
+
+static u64 access_sw_send_schedule(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl,
+ mode, data);
+}
+
+/* Software counters for the error status bits within MISC_ERR_STATUS */
+static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[12];
+}
+
+static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[11];
+}
+
+static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[10];
+}
+
+static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[9];
+}
+
+static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[8];
+}
+
+static u64 access_misc_efuse_read_bad_addr_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[7];
+}
+
+static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[6];
+}
+
+static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[5];
+}
+
+static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[4];
+}
+
+static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[3];
+}
+
+static u64 access_misc_csr_write_bad_addr_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[2];
+}
+
+static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[1];
+}
+
+static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->misc_err_status_cnt[0];
+}
+
+/*
+ * Software counter for the aggregate of
+ * individual CceErrStatus counters
+ */
+static u64 access_sw_cce_err_status_aggregated_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_cce_err_status_aggregate;
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within CceErrStatus
+ */
+static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[40];
+}
+
+static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[39];
+}
+
+static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[38];
+}
+
+static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[37];
+}
+
+static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[36];
+}
+
+static u64 access_cce_rxdma_conv_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[35];
+}
+
+static u64 access_cce_rcpl_async_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[34];
+}
+
+static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[33];
+}
+
+static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[32];
+}
+
+static u64 access_la_triggered_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[31];
+}
+
+static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[30];
+}
+
+static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[29];
+}
+
+static u64 access_pcic_transmit_back_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[28];
+}
+
+static u64 access_pcic_transmit_front_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[27];
+}
+
+static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[26];
+}
+
+static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[25];
+}
+
+static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[24];
+}
+
+static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[23];
+}
+
+static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[22];
+}
+
+static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[21];
+}
+
+static u64 access_pcic_n_post_dat_q_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[20];
+}
+
+static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[19];
+}
+
+static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[18];
+}
+
+static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[17];
+}
+
+static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[16];
+}
+
+static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[15];
+}
+
+static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[14];
+}
+
+static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[13];
+}
+
+static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[12];
+}
+
+static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[11];
+}
+
+static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[10];
+}
+
+static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[9];
+}
+
+static u64 access_cce_cli2_async_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[8];
+}
+
+static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[7];
+}
+
+static u64 access_cce_cli0_async_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[6];
+}
+
+static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[5];
+}
+
+static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[4];
+}
+
+static u64 access_cce_trgt_async_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[3];
+}
+
+static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[2];
+}
+
+static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[1];
+}
+
+static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->cce_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within RcvErrStatus
+ */
+static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[63];
+}
+
+static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[62];
+}
+
+static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[61];
+}
+
+static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[60];
+}
+
+static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[59];
+}
+
+static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[58];
+}
+
+static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[57];
+}
+
+static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[56];
+}
+
+static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[55];
+}
+
+static u64 access_rx_dma_data_fifo_rd_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[54];
+}
+
+static u64 access_rx_dma_data_fifo_rd_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[53];
+}
+
+static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[52];
+}
+
+static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[51];
+}
+
+static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[50];
+}
+
+static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[49];
+}
+
+static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[48];
+}
+
+static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[47];
+}
+
+static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[46];
+}
+
+static u64 access_rx_hq_intr_csr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[45];
+}
+
+static u64 access_rx_lookup_csr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[44];
+}
+
+static u64 access_rx_lookup_rcv_array_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[43];
+}
+
+static u64 access_rx_lookup_rcv_array_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[42];
+}
+
+static u64 access_rx_lookup_des_part2_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[41];
+}
+
+static u64 access_rx_lookup_des_part1_unc_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[40];
+}
+
+static u64 access_rx_lookup_des_part1_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[39];
+}
+
+static u64 access_rx_rbuf_next_free_buf_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[38];
+}
+
+static u64 access_rx_rbuf_next_free_buf_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[37];
+}
+
+static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[36];
+}
+
+static u64 access_rx_rbuf_fl_initdone_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[35];
+}
+
+static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[34];
+}
+
+static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[33];
+}
+
+static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[32];
+}
+
+static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[31];
+}
+
+static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[30];
+}
+
+static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[29];
+}
+
+static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[28];
+}
+
+static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[27];
+}
+
+static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[26];
+}
+
+static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[25];
+}
+
+static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[24];
+}
+
+static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[23];
+}
+
+static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[22];
+}
+
+static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[21];
+}
+
+static u64 access_rx_rbuf_block_list_read_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[20];
+}
+
+static u64 access_rx_rbuf_block_list_read_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[19];
+}
+
+static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[18];
+}
+
+static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[17];
+}
+
+static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[16];
+}
+
+static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[15];
+}
+
+static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[14];
+}
+
+static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[13];
+}
+
+static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[12];
+}
+
+static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[11];
+}
+
+static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[10];
+}
+
+static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[9];
+}
+
+static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[8];
+}
+
+static u64 access_rx_rcv_qp_map_table_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[7];
+}
+
+static u64 access_rx_rcv_qp_map_table_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[6];
+}
+
+static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[5];
+}
+
+static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[4];
+}
+
+static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[3];
+}
+
+static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[2];
+}
+
+static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[1];
+}
+
+static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->rcv_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendPioErrStatus
+ */
+static u64 access_pio_pec_sop_head_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[35];
+}
+
+static u64 access_pio_pcc_sop_head_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[34];
+}
+
+static u64 access_pio_last_returned_cnt_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[33];
+}
+
+static u64 access_pio_current_free_cnt_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[32];
+}
+
+static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[31];
+}
+
+static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[30];
+}
+
+static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[29];
+}
+
+static u64 access_pio_ppmc_bqc_mem_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[28];
+}
+
+static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[27];
+}
+
+static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[26];
+}
+
+static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[25];
+}
+
+static u64 access_pio_block_qw_count_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[24];
+}
+
+static u64 access_pio_write_qw_valid_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[23];
+}
+
+static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[22];
+}
+
+static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[21];
+}
+
+static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[20];
+}
+
+static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[19];
+}
+
+static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[18];
+}
+
+static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[17];
+}
+
+static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[16];
+}
+
+static u64 access_pio_credit_ret_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[15];
+}
+
+static u64 access_pio_v1_len_mem_bank1_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[14];
+}
+
+static u64 access_pio_v1_len_mem_bank0_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[13];
+}
+
+static u64 access_pio_v1_len_mem_bank1_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[12];
+}
+
+static u64 access_pio_v1_len_mem_bank0_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[11];
+}
+
+static u64 access_pio_sm_pkt_reset_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[10];
+}
+
+static u64 access_pio_pkt_evict_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[9];
+}
+
+static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[8];
+}
+
+static u64 access_pio_sbrdctl_crrel_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[7];
+}
+
+static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[6];
+}
+
+static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[5];
+}
+
+static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[4];
+}
+
+static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[3];
+}
+
+static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[2];
+}
+
+static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[1];
+}
+
+static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_pio_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendDmaErrStatus
+ */
+static u64 access_sdma_pcie_req_tracking_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_dma_err_status_cnt[3];
+}
+
+static u64 access_sdma_pcie_req_tracking_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_dma_err_status_cnt[2];
+}
+
+static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_dma_err_status_cnt[1];
+}
+
+static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_dma_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendEgressErrStatus
+ */
+static u64 access_tx_read_pio_memory_csr_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[63];
+}
+
+static u64 access_tx_read_sdma_memory_csr_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[62];
+}
+
+static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[61];
+}
+
+static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[60];
+}
+
+static u64 access_tx_read_sdma_memory_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[59];
+}
+
+static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[58];
+}
+
+static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[57];
+}
+
+static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[56];
+}
+
+static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[55];
+}
+
+static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[54];
+}
+
+static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[53];
+}
+
+static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[52];
+}
+
+static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[51];
+}
+
+static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[50];
+}
+
+static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[49];
+}
+
+static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[48];
+}
+
+static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[47];
+}
+
+static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[46];
+}
+
+static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[45];
+}
+
+static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[44];
+}
+
+static u64 access_tx_read_sdma_memory_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[43];
+}
+
+static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[42];
+}
+
+static u64 access_tx_credit_return_partiy_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[41];
+}
+
+static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[40];
+}
+
+static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[39];
+}
+
+static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[38];
+}
+
+static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[37];
+}
+
+static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[36];
+}
+
+static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[35];
+}
+
+static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[34];
+}
+
+static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[33];
+}
+
+static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[32];
+}
+
+static u64 access_tx_sdma15_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[31];
+}
+
+static u64 access_tx_sdma14_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[30];
+}
+
+static u64 access_tx_sdma13_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[29];
+}
+
+static u64 access_tx_sdma12_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[28];
+}
+
+static u64 access_tx_sdma11_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[27];
+}
+
+static u64 access_tx_sdma10_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[26];
+}
+
+static u64 access_tx_sdma9_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[25];
+}
+
+static u64 access_tx_sdma8_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[24];
+}
+
+static u64 access_tx_sdma7_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[23];
+}
+
+static u64 access_tx_sdma6_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[22];
+}
+
+static u64 access_tx_sdma5_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[21];
+}
+
+static u64 access_tx_sdma4_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[20];
+}
+
+static u64 access_tx_sdma3_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[19];
+}
+
+static u64 access_tx_sdma2_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[18];
+}
+
+static u64 access_tx_sdma1_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[17];
+}
+
+static u64 access_tx_sdma0_disallowed_packet_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[16];
+}
+
+static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[15];
+}
+
+static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[14];
+}
+
+static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[13];
+}
+
+static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[12];
+}
+
+static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[11];
+}
+
+static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[10];
+}
+
+static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[9];
+}
+
+static u64 access_tx_sdma_launch_intf_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[8];
+}
+
+static u64 access_tx_pio_launch_intf_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[7];
+}
+
+static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[6];
+}
+
+static u64 access_tx_incorrect_link_state_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[5];
+}
+
+static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[4];
+}
+
+static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[3];
+}
+
+static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[2];
+}
+
+static u64 access_tx_pkt_integrity_mem_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[1];
+}
+
+static u64 access_tx_pkt_integrity_mem_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_egress_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendErrStatus
+ */
+static u64 access_send_csr_write_bad_addr_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_err_status_cnt[2];
+}
+
+static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_err_status_cnt[1];
+}
+
+static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->send_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendCtxtErrStatus
+ */
+static u64 access_pio_write_out_of_bounds_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_ctxt_err_status_cnt[4];
+}
+
+static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_ctxt_err_status_cnt[3];
+}
+
+static u64 access_pio_write_crosses_boundary_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_ctxt_err_status_cnt[2];
+}
+
+static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_ctxt_err_status_cnt[1];
+}
+
+static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_ctxt_err_status_cnt[0];
+}
+
+/*
+ * Software counters corresponding to each of the
+ * error status bits within SendDmaEngErrStatus
+ */
+static u64 access_sdma_header_request_fifo_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[23];
+}
+
+static u64 access_sdma_header_storage_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[22];
+}
+
+static u64 access_sdma_packet_tracking_cor_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[21];
+}
+
+static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[20];
+}
+
+static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[19];
+}
+
+static u64 access_sdma_header_request_fifo_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[18];
+}
+
+static u64 access_sdma_header_storage_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[17];
+}
+
+static u64 access_sdma_packet_tracking_unc_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[16];
+}
+
+static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[15];
+}
+
+static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[14];
+}
+
+static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[13];
+}
+
+static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[12];
+}
+
+static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[11];
+}
+
+static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[10];
+}
+
+static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[9];
+}
+
+static u64 access_sdma_packet_desc_overflow_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[8];
+}
+
+static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl,
+ int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[7];
+}
+
+static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[6];
+}
+
+static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[5];
+}
+
+static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[4];
+}
+
+static u64 access_sdma_tail_out_of_bounds_err_cnt(
+ const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[3];
+}
+
+static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[2];
+}
+
+static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[1];
+}
+
+static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ return dd->sw_send_dma_eng_err_status_cnt[0];
+}
+
+static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry,
+ void *context, int vl, int mode,
+ u64 data)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+ u64 val = 0;
+ u64 csr = entry->csr;
+
+ val = read_write_csr(dd, csr, mode, data);
+ if (mode == CNTR_MODE_R) {
+ val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ?
+ CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors;
+ } else if (mode == CNTR_MODE_W) {
+ dd->sw_rcv_bypass_packet_errors = 0;
+ } else {
+ dd_dev_err(dd, "Invalid cntr register access mode");
+ return 0;
+ }
+ return val;
+}
+
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \
+ void *context, int vl, int mode, u64 data) \
+{ \
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \
+ return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr, \
+ ppd->ibport_data.rvp.cntr, vl, \
+ mode, data); \
+}
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry, \
+ void *context, int vl, int mode, u64 data) \
+{ \
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \
+ \
+ if (vl != CNTR_INVALID_VL) \
+ return 0; \
+ \
+ return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr, \
+ mode, data); \
+}
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+ CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+ CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+ RCV_TID_FLOW_GEN_MISMATCH_CNT,
+ CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+ CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+ RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+ CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+ CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+ CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+ CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+ CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+ CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH,
+ access_dc_rcv_err_cnt),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+ CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+ CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+ CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+ DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+ DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+ DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+ CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+ CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+ CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+ CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+ DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+ CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+ CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+ CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+ CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+ CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+ DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+ CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+ CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+ CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+ DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+ CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+ DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+ CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+ DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+ CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+ DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+ CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+ DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+ CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+ CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+ DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+ CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+ DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+ DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+ CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+ DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+ DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+ DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+ DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+ DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+ CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+ DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+ DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+ CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+ access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+ access_sw_pio_wait),
+[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL,
+ access_sw_pio_drain),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+ access_sw_kmem_wait),
+[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
+ access_sw_send_schedule),
+[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
+ SEND_DMA_DESC_FETCHED_CNT, 0,
+ CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+ dev_access_u32_csr),
+[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0,
+ CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+ access_sde_int_cnt),
+[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0,
+ CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+ access_sde_err_cnt),
+[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0,
+ CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+ access_sde_idle_int_cnt),
+[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0,
+ CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA,
+ access_sde_progress_int_cnt),
+/* MISC_ERR_STATUS */
+[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_pll_lock_fail_err_cnt),
+[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_mbist_fail_err_cnt),
+[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_invalid_eep_cmd_err_cnt),
+[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_efuse_done_parity_err_cnt),
+[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_efuse_write_err_cnt),
+[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0,
+ 0, CNTR_NORMAL,
+ access_misc_efuse_read_bad_addr_err_cnt),
+[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_efuse_csr_parity_err_cnt),
+[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_fw_auth_failed_err_cnt),
+[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_key_mismatch_err_cnt),
+[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_sbus_write_failed_err_cnt),
+[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_csr_write_bad_addr_err_cnt),
+[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_csr_read_bad_addr_err_cnt),
+[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0,
+ CNTR_NORMAL,
+ access_misc_csr_parity_err_cnt),
+/* CceErrStatus */
+[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0,
+ CNTR_NORMAL,
+ access_sw_cce_err_status_aggregated_cnt),
+[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_msix_csr_parity_err_cnt),
+[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_int_map_unc_err_cnt),
+[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_int_map_cor_err_cnt),
+[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_msix_table_unc_err_cnt),
+[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_msix_table_cor_err_cnt),
+[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_cce_rxdma_conv_fifo_parity_err_cnt),
+[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_cce_rcpl_async_fifo_parity_err_cnt),
+[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_seg_write_bad_addr_err_cnt),
+[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_seg_read_bad_addr_err_cnt),
+[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0,
+ CNTR_NORMAL,
+ access_la_triggered_cnt),
+[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_trgt_cpl_timeout_err_cnt),
+[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_receive_parity_err_cnt),
+[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_transmit_back_parity_err_cnt),
+[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_pcic_transmit_front_parity_err_cnt),
+[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_cpl_dat_q_unc_err_cnt),
+[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_cpl_hd_q_unc_err_cnt),
+[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_post_dat_q_unc_err_cnt),
+[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_post_hd_q_unc_err_cnt),
+[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_retry_sot_mem_unc_err_cnt),
+[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_retry_mem_unc_err),
+[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_n_post_dat_q_parity_err_cnt),
+[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_n_post_h_q_parity_err_cnt),
+[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_cpl_dat_q_cor_err_cnt),
+[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_cpl_hd_q_cor_err_cnt),
+[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_post_dat_q_cor_err_cnt),
+[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_post_hd_q_cor_err_cnt),
+[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_retry_sot_mem_cor_err_cnt),
+[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pcic_retry_mem_cor_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM(
+ "CceCli1AsyncFifoDbgParityError", 0, 0,
+ CNTR_NORMAL,
+ access_cce_cli1_async_fifo_dbg_parity_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM(
+ "CceCli1AsyncFifoRxdmaParityError", 0, 0,
+ CNTR_NORMAL,
+ access_cce_cli1_async_fifo_rxdma_parity_err_cnt
+ ),
+[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM(
+ "CceCli1AsyncFifoSdmaHdParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt),
+[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM(
+ "CceCli1AsyncFifoPioCrdtParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt),
+[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_cce_cli2_async_fifo_parity_err_cnt),
+[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_csr_cfg_bus_parity_err_cnt),
+[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_cce_cli0_async_fifo_parity_err_cnt),
+[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_rspd_data_parity_err_cnt),
+[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_trgt_access_err_cnt),
+[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_cce_trgt_async_fifo_parity_err_cnt),
+[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_csr_write_bad_addr_err_cnt),
+[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_cce_csr_read_bad_addr_err_cnt),
+[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_ccs_csr_parity_err_cnt),
+
+/* RcvErrStatus */
+[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_csr_parity_err_cnt),
+[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_csr_write_bad_addr_err_cnt),
+[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_csr_read_bad_addr_err_cnt),
+[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_csr_unc_err_cnt),
+[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_dq_fsm_encoding_err_cnt),
+[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_eq_fsm_encoding_err_cnt),
+[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_csr_parity_err_cnt),
+[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_data_cor_err_cnt),
+[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_data_unc_err_cnt),
+[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_data_fifo_rd_cor_err_cnt),
+[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_data_fifo_rd_unc_err_cnt),
+[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_hdr_fifo_rd_cor_err_cnt),
+[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_hdr_fifo_rd_unc_err_cnt),
+[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_desc_part2_cor_err_cnt),
+[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_desc_part2_unc_err_cnt),
+[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_desc_part1_cor_err_cnt),
+[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_desc_part1_unc_err_cnt),
+[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_hq_intr_fsm_err_cnt),
+[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_hq_intr_csr_parity_err_cnt),
+[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_lookup_csr_parity_err_cnt),
+[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_lookup_rcv_array_cor_err_cnt),
+[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_lookup_rcv_array_unc_err_cnt),
+[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_lookup_des_part2_parity_err_cnt),
+[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_lookup_des_part1_unc_cor_err_cnt),
+[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_lookup_des_part1_unc_err_cnt),
+[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_next_free_buf_cor_err_cnt),
+[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_next_free_buf_unc_err_cnt),
+[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM(
+ "RxRbufFlInitWrAddrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rbuf_fl_init_wr_addr_parity_err_cnt),
+[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_fl_initdone_parity_err_cnt),
+[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_fl_write_addr_parity_err_cnt),
+[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_fl_rd_addr_parity_err_cnt),
+[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_empty_err_cnt),
+[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_full_err_cnt),
+[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0,
+ CNTR_NORMAL,
+ access_rbuf_bad_lookup_err_cnt),
+[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rbuf_ctx_id_parity_err_cnt),
+[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rbuf_csr_qeopdw_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM(
+ "RxRbufCsrQNumOfPktParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM(
+ "RxRbufCsrQTlPtrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_csr_q_vld_bit_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_rx_rbuf_csr_q_next_buf_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt),
+[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM(
+ "RxRbufCsrQHeadBufNumParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt),
+[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_block_list_read_cor_err_cnt),
+[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0,
+ 0, CNTR_NORMAL,
+ access_rx_rbuf_block_list_read_unc_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_lookup_des_cor_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_lookup_des_unc_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM(
+ "RxRbufLookupDesRegUncCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt),
+[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_lookup_des_reg_unc_err_cnt),
+[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_free_list_cor_err_cnt),
+[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rbuf_free_list_unc_err_cnt),
+[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_fsm_encoding_err_cnt),
+[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_flag_cor_err_cnt),
+[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_flag_unc_err_cnt),
+[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dc_sop_eop_parity_err_cnt),
+[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_csr_parity_err_cnt),
+[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_qp_map_table_cor_err_cnt),
+[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_qp_map_table_unc_err_cnt),
+[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_data_cor_err_cnt),
+[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_data_unc_err_cnt),
+[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_hdr_cor_err_cnt),
+[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_rcv_hdr_unc_err_cnt),
+[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dc_intf_parity_err_cnt),
+[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_rx_dma_csr_cor_err_cnt),
+/* SendPioErrStatus */
+[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_pec_sop_head_parity_err_cnt),
+[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_pcc_sop_head_parity_err_cnt),
+[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_pio_last_returned_cnt_parity_err_cnt),
+[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_pio_current_free_cnt_parity_err_cnt),
+[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0,
+ CNTR_NORMAL,
+ access_pio_reserved_31_err_cnt),
+[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0,
+ CNTR_NORMAL,
+ access_pio_reserved_30_err_cnt),
+[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_ppmc_sop_len_err_cnt),
+[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_ppmc_bqc_mem_parity_err_cnt),
+[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_vl_fifo_parity_err_cnt),
+[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_vlf_sop_parity_err_cnt),
+[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_vlf_v1_len_parity_err_cnt),
+[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_block_qw_count_parity_err_cnt),
+[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_write_qw_valid_parity_err_cnt),
+[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_state_machine_err_cnt),
+[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_write_data_parity_err_cnt),
+[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_host_addr_mem_cor_err_cnt),
+[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_host_addr_mem_unc_err_cnt),
+[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_pkt_evict_sm_or_arb_sm_err_cnt),
+[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_init_sm_in_err_cnt),
+[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_ppmc_pbl_fifo_err_cnt),
+[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0,
+ 0, CNTR_NORMAL,
+ access_pio_credit_ret_fifo_parity_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_v1_len_mem_bank1_cor_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_v1_len_mem_bank0_cor_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_v1_len_mem_bank1_unc_err_cnt),
+[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_v1_len_mem_bank0_unc_err_cnt),
+[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_sm_pkt_reset_parity_err_cnt),
+[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_pkt_evict_fifo_parity_err_cnt),
+[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM(
+ "PioSbrdctrlCrrelFifoParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_sbrdctrl_crrel_fifo_parity_err_cnt),
+[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_sbrdctl_crrel_parity_err_cnt),
+[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_pec_fifo_parity_err_cnt),
+[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_pcc_fifo_parity_err_cnt),
+[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0,
+ CNTR_NORMAL,
+ access_pio_sb_mem_fifo1_err_cnt),
+[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0,
+ CNTR_NORMAL,
+ access_pio_sb_mem_fifo0_err_cnt),
+[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_csr_parity_err_cnt),
+[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_write_addr_parity_err_cnt),
+[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_write_bad_ctxt_err_cnt),
+/* SendDmaErrStatus */
+[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0,
+ 0, CNTR_NORMAL,
+ access_sdma_pcie_req_tracking_cor_err_cnt),
+[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0,
+ 0, CNTR_NORMAL,
+ access_sdma_pcie_req_tracking_unc_err_cnt),
+[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_csr_parity_err_cnt),
+[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_rpy_tag_err_cnt),
+/* SendEgressErrStatus */
+[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_read_pio_memory_csr_unc_err_cnt),
+[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0,
+ 0, CNTR_NORMAL,
+ access_tx_read_sdma_memory_csr_err_cnt),
+[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_egress_fifo_cor_err_cnt),
+[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_read_pio_memory_cor_err_cnt),
+[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_read_sdma_memory_cor_err_cnt),
+[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_sb_hdr_cor_err_cnt),
+[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_credit_overrun_err_cnt),
+[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo8_cor_err_cnt),
+[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo7_cor_err_cnt),
+[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo6_cor_err_cnt),
+[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo5_cor_err_cnt),
+[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo4_cor_err_cnt),
+[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo3_cor_err_cnt),
+[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo2_cor_err_cnt),
+[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo1_cor_err_cnt),
+[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_fifo0_cor_err_cnt),
+[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_credit_return_vl_err_cnt),
+[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_hcrc_insertion_err_cnt),
+[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_egress_fifo_unc_err_cnt),
+[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_read_pio_memory_unc_err_cnt),
+[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_read_sdma_memory_unc_err_cnt),
+[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_sb_hdr_unc_err_cnt),
+[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_credit_return_partiy_err_cnt),
+[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo8_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo7_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo6_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo5_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo4_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo3_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo2_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo1_unc_or_parity_err_cnt),
+[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_launch_fifo0_unc_or_parity_err_cnt),
+[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma15_disallowed_packet_err_cnt),
+[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma14_disallowed_packet_err_cnt),
+[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma13_disallowed_packet_err_cnt),
+[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma12_disallowed_packet_err_cnt),
+[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma11_disallowed_packet_err_cnt),
+[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma10_disallowed_packet_err_cnt),
+[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma9_disallowed_packet_err_cnt),
+[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma8_disallowed_packet_err_cnt),
+[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma7_disallowed_packet_err_cnt),
+[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma6_disallowed_packet_err_cnt),
+[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma5_disallowed_packet_err_cnt),
+[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma4_disallowed_packet_err_cnt),
+[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma3_disallowed_packet_err_cnt),
+[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma2_disallowed_packet_err_cnt),
+[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma1_disallowed_packet_err_cnt),
+[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma0_disallowed_packet_err_cnt),
+[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_config_parity_err_cnt),
+[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_sbrd_ctl_csr_parity_err_cnt),
+[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_launch_csr_parity_err_cnt),
+[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_illegal_vl_err_cnt),
+[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM(
+ "TxSbrdCtlStateMachineParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_sbrd_ctl_state_machine_parity_err_cnt),
+[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0,
+ CNTR_NORMAL,
+ access_egress_reserved_10_err_cnt),
+[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0,
+ CNTR_NORMAL,
+ access_egress_reserved_9_err_cnt),
+[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr",
+ 0, 0, CNTR_NORMAL,
+ access_tx_sdma_launch_intf_parity_err_cnt),
+[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_pio_launch_intf_parity_err_cnt),
+[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0,
+ CNTR_NORMAL,
+ access_egress_reserved_6_err_cnt),
+[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_incorrect_link_state_err_cnt),
+[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_linkdown_err_cnt),
+[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM(
+ "EgressFifoUnderrunOrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_egress_fifi_underrun_or_parity_err_cnt),
+[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0,
+ CNTR_NORMAL,
+ access_egress_reserved_2_err_cnt),
+[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_pkt_integrity_mem_unc_err_cnt),
+[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_tx_pkt_integrity_mem_cor_err_cnt),
+/* SendErrStatus */
+[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_send_csr_write_bad_addr_err_cnt),
+[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0,
+ CNTR_NORMAL,
+ access_send_csr_read_bad_addr_err_cnt),
+[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0,
+ CNTR_NORMAL,
+ access_send_csr_parity_cnt),
+/* SendCtxtErrStatus */
+[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_write_out_of_bounds_err_cnt),
+[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_write_overflow_err_cnt),
+[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr",
+ 0, 0, CNTR_NORMAL,
+ access_pio_write_crosses_boundary_err_cnt),
+[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_disallowed_packet_err_cnt),
+[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0,
+ CNTR_NORMAL,
+ access_pio_inconsistent_sop_err_cnt),
+/* SendDmaEngErrStatus */
+[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr",
+ 0, 0, CNTR_NORMAL,
+ access_sdma_header_request_fifo_cor_err_cnt),
+[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_header_storage_cor_err_cnt),
+[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_packet_tracking_cor_err_cnt),
+[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_assembly_cor_err_cnt),
+[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_desc_table_cor_err_cnt),
+[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr",
+ 0, 0, CNTR_NORMAL,
+ access_sdma_header_request_fifo_unc_err_cnt),
+[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_header_storage_unc_err_cnt),
+[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_packet_tracking_unc_err_cnt),
+[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_assembly_unc_err_cnt),
+[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_desc_table_unc_err_cnt),
+[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_timeout_err_cnt),
+[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_header_length_err_cnt),
+[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_header_address_err_cnt),
+[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_header_select_err_cnt),
+[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_reserved_9_err_cnt),
+[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_packet_desc_overflow_err_cnt),
+[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_length_mismatch_err_cnt),
+[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_halt_err_cnt),
+[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_mem_read_err_cnt),
+[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_first_desc_err_cnt),
+[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_tail_out_of_bounds_err_cnt),
+[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_too_long_err_cnt),
+[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_gen_mismatch_err_cnt),
+[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0,
+ CNTR_NORMAL,
+ access_sdma_wrong_dw_err_cnt),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+ CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+ CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+ CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+ CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+ CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+ access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+ access_sw_link_up_cnt),
+[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL,
+ access_sw_unknown_frame_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+ access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+ CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+ access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+ access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+ access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+ access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/* return true if this is chip revision revision a */
+int is_ax(struct hfi1_devdata *dd)
+{
+ u8 chip_rev_minor =
+ dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+ & CCE_REVISION_CHIP_REV_MINOR_MASK;
+ return (chip_rev_minor & 0xf0) == 0;
+}
+
+/* return true if this is chip revision revision b */
+int is_bx(struct hfi1_devdata *dd)
+{
+ u8 chip_rev_minor =
+ dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+ & CCE_REVISION_CHIP_REV_MINOR_MASK;
+ return (chip_rev_minor & 0xF0) == 0x10;
+}
+
+/*
+ * Append string s to buffer buf. Arguments curp and len are the current
+ * position and remaining length, respectively.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+ char *p = *curp;
+ int len = *lenp;
+ int result = 0; /* success */
+ char c;
+
+ /* add a comma, if first in the buffer */
+ if (p != buf) {
+ if (len == 0) {
+ result = 1; /* out of room */
+ goto done;
+ }
+ *p++ = ',';
+ len--;
+ }
+
+ /* copy the string */
+ while ((c = *s++) != 0) {
+ if (len == 0) {
+ result = 1; /* out of room */
+ goto done;
+ }
+ *p++ = c;
+ len--;
+ }
+
+done:
+ /* write return values */
+ *curp = p;
+ *lenp = len;
+
+ return result;
+}
+
+/*
+ * Using the given flag table, print a comma separated string into
+ * the buffer. End in '*' if the buffer is too short.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+ struct flag_table *table, int table_size)
+{
+ char extra[32];
+ char *p = buf;
+ int len = buf_len;
+ int no_room = 0;
+ int i;
+
+ /* make sure there is at least 2 so we can form "*" */
+ if (len < 2)
+ return "";
+
+ len--; /* leave room for a nul */
+ for (i = 0; i < table_size; i++) {
+ if (flags & table[i].flag) {
+ no_room = append_str(buf, &p, &len, table[i].str);
+ if (no_room)
+ break;
+ flags &= ~table[i].flag;
+ }
+ }
+
+ /* any undocumented bits left? */
+ if (!no_room && flags) {
+ snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
+ no_room = append_str(buf, &p, &len, extra);
+ }
+
+ /* add * if ran out of room */
+ if (no_room) {
+ /* may need to back up to add space for a '*' */
+ if (len == 0)
+ --p;
+ *p++ = '*';
+ }
+
+ /* add final nul - space already allocated above */
+ *p = 0;
+ return buf;
+}
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+ "CceErrInt", /* 0 */
+ "RxeErrInt", /* 1 */
+ "MiscErrInt", /* 2 */
+ "Reserved3", /* 3 */
+ "PioErrInt", /* 4 */
+ "SDmaErrInt", /* 5 */
+ "EgressErrInt", /* 6 */
+ "TxeErrInt" /* 7 */
+};
+
+/*
+ * Return the miscellaneous error interrupt name.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+ if (source < ARRAY_SIZE(cce_misc_names))
+ strncpy(buf, cce_misc_names[source], bsize);
+ else
+ snprintf(buf, bsize, "Reserved%u",
+ source + IS_GENERAL_ERR_START);
+
+ return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+ return buf;
+}
+
+static const char * const various_names[] = {
+ "PbcInt",
+ "GpioAssertInt",
+ "Qsfp1Int",
+ "Qsfp2Int",
+ "TCritInt"
+};
+
+/*
+ * Return the various interrupt name.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+ if (source < ARRAY_SIZE(various_names))
+ strncpy(buf, various_names[source], bsize);
+ else
+ snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START);
+ return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+ static const char * const dc_int_names[] = {
+ "common",
+ "lcb",
+ "8051",
+ "lbm" /* local block merge */
+ };
+
+ if (source < ARRAY_SIZE(dc_int_names))
+ snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+ else
+ snprintf(buf, bsize, "DCInt%u", source);
+ return buf;
+}
+
+static const char * const sdma_int_names[] = {
+ "SDmaInt",
+ "SdmaIdleInt",
+ "SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+ /* what interrupt */
+ unsigned int what = source / TXE_NUM_SDMA_ENGINES;
+ /* which engine */
+ unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+ if (likely(what < 3))
+ snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+ else
+ snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+ return buf;
+}
+
+/*
+ * Return the receive available interrupt name.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "RcvAvailInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "RcvUrgentInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "SendCreditInt%u", source);
+ return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+ snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+ return buf;
+}
+
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ cce_err_status_flags,
+ ARRAY_SIZE(cce_err_status_flags));
+}
+
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ rxe_err_status_flags,
+ ARRAY_SIZE(rxe_err_status_flags));
+}
+
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, misc_err_status_flags,
+ ARRAY_SIZE(misc_err_status_flags));
+}
+
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ pio_err_status_flags,
+ ARRAY_SIZE(pio_err_status_flags));
+}
+
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ sdma_err_status_flags,
+ ARRAY_SIZE(sdma_err_status_flags));
+}
+
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ egress_err_status_flags,
+ ARRAY_SIZE(egress_err_status_flags));
+}
+
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ egress_err_info_flags,
+ ARRAY_SIZE(egress_err_info_flags));
+}
+
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ send_err_status_flags,
+ ARRAY_SIZE(send_err_status_flags));
+}
+
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+ int i = 0;
+
+ /*
+ * For most these errors, there is nothing that can be done except
+ * report or record it.
+ */
+ dd_dev_info(dd, "CCE Error: %s\n",
+ cce_err_status_string(buf, sizeof(buf), reg));
+
+ if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) &&
+ is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
+ /* this error requires a manual drop into SPC freeze mode */
+ /* then a fix up */
+ start_freeze_handling(dd->pport, FREEZE_SELF);
+ }
+
+ for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i)) {
+ incr_cntr64(&dd->cce_err_status_cnt[i]);
+ /* maintain a counter over all cce_err_status errors */
+ incr_cntr64(&dd->sw_cce_err_status_aggregate);
+ }
+ }
+}
+
+/*
+ * Check counters for receive errors that do not have an interrupt
+ * associated with them.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+ struct hfi1_pportdata *ppd = dd->pport;
+ u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+ if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
+ ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+ dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+ set_link_down_reason(
+ ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+ OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+ queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+ }
+ dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt;
+
+ mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+ setup_timer(&dd->rcverr_timer, update_rcverr_timer, (unsigned long)dd);
+ /* Assume the hardware counter has been reset */
+ dd->rcv_ovfl_cnt = 0;
+ return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+ if (dd->rcverr_timer.data)
+ del_timer_sync(&dd->rcverr_timer);
+ dd->rcverr_timer.data = 0;
+}
+
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+ int i = 0;
+
+ dd_dev_info(dd, "Receive Error: %s\n",
+ rxe_err_status_string(buf, sizeof(buf), reg));
+
+ if (reg & ALL_RXE_FREEZE_ERR) {
+ int flags = 0;
+
+ /*
+ * Freeze mode recovery is disabled for the errors
+ * in RXE_FREEZE_ABORT_MASK
+ */
+ if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK))
+ flags = FREEZE_ABORT;
+
+ start_freeze_handling(dd->pport, flags);
+ }
+
+ for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i))
+ incr_cntr64(&dd->rcv_err_status_cnt[i]);
+ }
+}
+
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+ int i = 0;
+
+ dd_dev_info(dd, "Misc Error: %s",
+ misc_err_status_string(buf, sizeof(buf), reg));
+ for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i))
+ incr_cntr64(&dd->misc_err_status_cnt[i]);
+ }
+}
+
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+ int i = 0;
+
+ dd_dev_info(dd, "PIO Error: %s\n",
+ pio_err_status_string(buf, sizeof(buf), reg));
+
+ if (reg & ALL_PIO_FREEZE_ERR)
+ start_freeze_handling(dd->pport, 0);
+
+ for (i = 0; i < NUM_SEND_PIO_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i))
+ incr_cntr64(&dd->send_pio_err_status_cnt[i]);
+ }
+}
+
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+ int i = 0;
+
+ dd_dev_info(dd, "SDMA Error: %s\n",
+ sdma_err_status_string(buf, sizeof(buf), reg));
+
+ if (reg & ALL_SDMA_FREEZE_ERR)
+ start_freeze_handling(dd->pport, 0);
+
+ for (i = 0; i < NUM_SEND_DMA_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i))
+ incr_cntr64(&dd->send_dma_err_status_cnt[i]);
+ }
+}
+
+static inline void __count_port_discards(struct hfi1_pportdata *ppd)
+{
+ incr_cntr64(&ppd->port_xmit_discards);
+}
+
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+ __count_port_discards(dd->pport);
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed, and update relevant error counter, etc.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check, and so we can miss the reason for an
+ * egress error if more than one packet fails the same integrity check
+ * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd,
+ int vl)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+ u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+ char buf[96];
+
+ /* clear down all observed info as quickly as possible after read */
+ write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+ dd_dev_info(dd,
+ "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+ info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+ /* Eventually add other counters for each bit */
+ if (info & PORT_DISCARD_EGRESS_ERRS) {
+ int weight, i;
+
+ /*
+ * Count all applicable bits as individual errors and
+ * attribute them to the packet that triggered this handler.
+ * This may not be completely accurate due to limitations
+ * on the available hardware error information. There is
+ * a single information register and any number of error
+ * packets may have occurred and contributed to it before
+ * this routine is called. This means that:
+ * a) If multiple packets with the same error occur before
+ * this routine is called, earlier packets are missed.
+ * There is only a single bit for each error type.
+ * b) Errors may not be attributed to the correct VL.
+ * The driver is attributing all bits in the info register
+ * to the packet that triggered this call, but bits
+ * could be an accumulation of different packets with
+ * different VLs.
+ * c) A single error packet may have multiple counts attached
+ * to it. There is no way for the driver to know if
+ * multiple bits set in the info register are due to a
+ * single packet or multiple packets. The driver assumes
+ * multiple packets.
+ */
+ weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS);
+ for (i = 0; i < weight; i++) {
+ __count_port_discards(ppd);
+ if (vl >= 0 && vl < TXE_NUM_DATA_VL)
+ incr_cntr64(&ppd->port_xmit_discards_vl[vl]);
+ else if (vl == 15)
+ incr_cntr64(&ppd->port_xmit_discards_vl
+ [C_VL_15]);
+ }
+ }
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?
+ */
+static inline int port_inactive_err(u64 posn)
+{
+ return (posn >= SEES(TX_LINKDOWN) &&
+ posn <= SEES(TX_INCORRECT_LINK_STATE));
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?
+ */
+static inline int disallowed_pkt_err(int posn)
+{
+ return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
+ posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
+}
+
+/*
+ * Input value is a bit position of one of the SDMA engine disallowed
+ * packet errors. Return which engine. Use of this must be guarded by
+ * disallowed_pkt_err().
+ */
+static inline int disallowed_pkt_engine(int posn)
+{
+ return posn - SEES(TX_SDMA0_DISALLOWED_PACKET);
+}
+
+/*
+ * Translate an SDMA engine to a VL. Return -1 if the tranlation cannot
+ * be done.
+ */
+static int engine_to_vl(struct hfi1_devdata *dd, int engine)
+{
+ struct sdma_vl_map *m;
+ int vl;
+
+ /* range check */
+ if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES)
+ return -1;
+
+ rcu_read_lock();
+ m = rcu_dereference(dd->sdma_map);
+ vl = m->engine_to_vl[engine];
+ rcu_read_unlock();
+
+ return vl;
+}
+
+/*
+ * Translate the send context (sofware index) into a VL. Return -1 if the
+ * translation cannot be done.
+ */
+static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
+{
+ struct send_context_info *sci;
+ struct send_context *sc;
+ int i;
+
+ sci = &dd->send_contexts[sw_index];
+
+ /* there is no information for user (PSM) and ack contexts */
+ if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
+ return -1;
+
+ sc = sci->sc;
+ if (!sc)
+ return -1;
+ if (dd->vld[15].sc == sc)
+ return 15;
+ for (i = 0; i < num_vls; i++)
+ if (dd->vld[i].sc == sc)
+ return i;
+
+ return -1;
+}
+
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ u64 reg_copy = reg, handled = 0;
+ char buf[96];
+ int i = 0;
+
+ if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
+ start_freeze_handling(dd->pport, 0);
+ else if (is_ax(dd) &&
+ (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) &&
+ (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+ start_freeze_handling(dd->pport, 0);
+
+ while (reg_copy) {
+ int posn = fls64(reg_copy);
+ /* fls64() returns a 1-based offset, we want it zero based */
+ int shift = posn - 1;
+ u64 mask = 1ULL << shift;
+
+ if (port_inactive_err(shift)) {
+ count_port_inactive(dd);
+ handled |= mask;
+ } else if (disallowed_pkt_err(shift)) {
+ int vl = engine_to_vl(dd, disallowed_pkt_engine(shift));
+
+ handle_send_egress_err_info(dd, vl);
+ handled |= mask;
+ }
+ reg_copy &= ~mask;
+ }
+
+ reg &= ~handled;
+
+ if (reg)
+ dd_dev_info(dd, "Egress Error: %s\n",
+ egress_err_status_string(buf, sizeof(buf), reg));
+
+ for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i))
+ incr_cntr64(&dd->send_egress_err_status_cnt[i]);
+ }
+}
+
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+ int i = 0;
+
+ dd_dev_info(dd, "Send Error: %s\n",
+ send_err_status_string(buf, sizeof(buf), reg));
+
+ for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) {
+ if (reg & (1ull << i))
+ incr_cntr64(&dd->send_err_status_cnt[i]);
+ }
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error. This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register. All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+ u32 context,
+ const struct err_reg_info *eri)
+{
+ u64 reg;
+ u32 count;
+
+ /* read in a loop until no more errors are seen */
+ count = 0;
+ while (1) {
+ reg = read_kctxt_csr(dd, context, eri->status);
+ if (reg == 0)
+ break;
+ write_kctxt_csr(dd, context, eri->clear, reg);
+ if (likely(eri->handler))
+ eri->handler(dd, context, reg);
+ count++;
+ if (count > MAX_CLEAR_COUNT) {
+ u64 mask;
+
+ dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+ eri->desc, reg);
+ /*
+ * Read-modify-write so any other masked bits
+ * remain masked.
+ */
+ mask = read_kctxt_csr(dd, context, eri->mask);
+ mask &= ~reg;
+ write_kctxt_csr(dd, context, eri->mask, mask);
+ break;
+ }
+ }
+}
+
+/*
+ * CCE block "misc" interrupt. Source is < 16.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct err_reg_info *eri = &misc_errs[source];
+
+ if (eri->handler) {
+ interrupt_clear_down(dd, 0, eri);
+ } else {
+ dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+ source);
+ }
+}
+
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags,
+ sc_err_status_flags,
+ ARRAY_SIZE(sc_err_status_flags));
+}
+
+/*
+ * Send context error interrupt. Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt. The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+ unsigned int hw_context)
+{
+ struct send_context_info *sci;
+ struct send_context *sc;
+ char flags[96];
+ u64 status;
+ u32 sw_index;
+ int i = 0;
+
+ sw_index = dd->hw_to_sw[hw_context];
+ if (sw_index >= dd->num_send_contexts) {
+ dd_dev_err(dd,
+ "out of range sw index %u for send context %u\n",
+ sw_index, hw_context);
+ return;
+ }
+ sci = &dd->send_contexts[sw_index];
+ sc = sci->sc;
+ if (!sc) {
+ dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+ sw_index, hw_context);
+ return;
+ }
+
+ /* tell the software that a halt has begun */
+ sc_stop(sc, SCF_HALTED);
+
+ status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+ dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+ send_context_err_status_string(flags, sizeof(flags),
+ status));
+
+ if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+ handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index));
+
+ /*
+ * Automatically restart halted kernel contexts out of interrupt
+ * context. User contexts must ask the driver to restart the context.
+ */
+ if (sc->type != SC_USER)
+ queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+
+ /*
+ * Update the counters for the corresponding status bits.
+ * Note that these particular counters are aggregated over all
+ * 160 contexts.
+ */
+ for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) {
+ if (status & (1ull << i))
+ incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]);
+ }
+}
+
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+ unsigned int source, u64 status)
+{
+ struct sdma_engine *sde;
+ int i = 0;
+
+ sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+ sde->this_idx, source, (unsigned long long)status);
+#endif
+ sde->err_cnt++;
+ sdma_engine_error(sde, status);
+
+ /*
+ * Update the counters for the corresponding status bits.
+ * Note that these particular counters are aggregated over
+ * all 16 DMA engines.
+ */
+ for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) {
+ if (status & (1ull << i))
+ incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]);
+ }
+}
+
+/*
+ * CCE block SDMA error interrupt. Source is < 16.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+ struct sdma_engine *sde = &dd->per_sdma[source];
+
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+ source);
+ sdma_dumpstate(sde);
+#endif
+ interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt. Source is < 8.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct err_reg_info *eri = &various_err[source];
+
+ /*
+ * TCritInt cannot go through interrupt_clear_down()
+ * because it is not a second tier interrupt. The handler
+ * should be called directly.
+ */
+ if (source == TCRIT_INT_SOURCE)
+ handle_temp_err(dd);
+ else if (eri->handler)
+ interrupt_clear_down(dd, 0, eri);
+ else
+ dd_dev_info(dd,
+ "%s: Unimplemented/reserved interrupt %d\n",
+ __func__, source);
+}
+
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+ /* src_ctx is always zero */
+ struct hfi1_pportdata *ppd = dd->pport;
+ unsigned long flags;
+ u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+ if (reg & QSFP_HFI0_MODPRST_N) {
+ if (!qsfp_mod_present(ppd)) {
+ dd_dev_info(dd, "%s: QSFP module removed\n",
+ __func__);
+
+ ppd->driver_link_ready = 0;
+ /*
+ * Cable removed, reset all our information about the
+ * cache and cable capabilities
+ */
+
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ /*
+ * We don't set cache_refresh_required here as we expect
+ * an interrupt when a cable is inserted
+ */
+ ppd->qsfp_info.cache_valid = 0;
+ ppd->qsfp_info.reset_needed = 0;
+ ppd->qsfp_info.limiting_active = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+ flags);
+ /* Invert the ModPresent pin now to detect plug-in */
+ write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+ ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+
+ if ((ppd->offline_disabled_reason >
+ HFI1_ODR_MASK(
+ OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) ||
+ (ppd->offline_disabled_reason ==
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(
+ OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+
+ if (ppd->host_link_state == HLS_DN_POLL) {
+ /*
+ * The link is still in POLL. This means
+ * that the normal link down processing
+ * will not happen. We have to do it here
+ * before turning the DC off.
+ */
+ queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+ }
+ } else {
+ dd_dev_info(dd, "%s: QSFP module inserted\n",
+ __func__);
+
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.cache_valid = 0;
+ ppd->qsfp_info.cache_refresh_required = 1;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+ flags);
+
+ /*
+ * Stop inversion of ModPresent pin to detect
+ * removal of the cable
+ */
+ qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+ write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT :
+ ASIC_QSFP1_INVERT, qsfp_int_mgmt);
+
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+ }
+ }
+
+ if (reg & QSFP_HFI0_INT_N) {
+ dd_dev_info(dd, "%s: Interrupt received from QSFP module\n",
+ __func__);
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.check_interrupt_flags = 1;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+ }
+
+ /* Schedule the QSFP work only if there is a cable attached. */
+ if (qsfp_mod_present(ppd))
+ queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = do_8051_command(dd, HCMD_MISC,
+ (u64)HCMD_MISC_REQUEST_LCB_ACCESS <<
+ LOAD_DATA_FIELD_ID_SHIFT, NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "%s: command failed with error %d\n",
+ __func__, ret);
+ }
+ return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = do_8051_command(dd, HCMD_MISC,
+ (u64)HCMD_MISC_GRANT_LCB_ACCESS <<
+ LOAD_DATA_FIELD_ID_SHIFT, NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "%s: command failed with error %d\n",
+ __func__, ret);
+ }
+ return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+/*
+ * Set the LCB selector - allow host access. The DCC selector always
+ * points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+ write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+ DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK |
+ DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+}
+
+/*
+ * Clear the LCB selector - allow 8051 access. The DCC selector always
+ * points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+ write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+ DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+}
+
+/*
+ * Acquire LCB access from the 8051. If the host already has access,
+ * just increment a counter. Otherwise, inform the 8051 that the
+ * host is taking access.
+ *
+ * Returns:
+ * 0 on success
+ * -EBUSY if the 8051 has control and cannot be disturbed
+ * -errno if unable to acquire access from the 8051
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ int ret = 0;
+
+ /*
+ * Use the host link state lock so the operation of this routine
+ * { link state check, selector change, count increment } can occur
+ * as a unit against a link state change. Otherwise there is a
+ * race between the state change and the count increment.
+ */
+ if (sleep_ok) {
+ mutex_lock(&ppd->hls_lock);
+ } else {
+ while (!mutex_trylock(&ppd->hls_lock))
+ udelay(1);
+ }
+
+ /* this access is valid only when the link is up */
+ if (ppd->host_link_state & HLS_DOWN) {
+ dd_dev_info(dd, "%s: link state %s not up\n",
+ __func__, link_state_name(ppd->host_link_state));
+ ret = -EBUSY;
+ goto done;
+ }
+
+ if (dd->lcb_access_count == 0) {
+ ret = request_host_lcb_access(dd);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to acquire LCB access, err %d\n",
+ __func__, ret);
+ goto done;
+ }
+ set_host_lcb_access(dd);
+ }
+ dd->lcb_access_count++;
+done:
+ mutex_unlock(&ppd->hls_lock);
+ return ret;
+}
+
+/*
+ * Release LCB access by decrementing the use count. If the count is moving
+ * from 1 to 0, inform 8051 that it has control back.
+ *
+ * Returns:
+ * 0 on success
+ * -errno if unable to release access to the 8051
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+ int ret = 0;
+
+ /*
+ * Use the host link state lock because the acquire needed it.
+ * Here, we only need to keep { selector change, count decrement }
+ * as a unit.
+ */
+ if (sleep_ok) {
+ mutex_lock(&dd->pport->hls_lock);
+ } else {
+ while (!mutex_trylock(&dd->pport->hls_lock))
+ udelay(1);
+ }
+
+ if (dd->lcb_access_count == 0) {
+ dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n",
+ __func__);
+ goto done;
+ }
+
+ if (dd->lcb_access_count == 1) {
+ set_8051_lcb_access(dd);
+ ret = request_8051_lcb_access(dd);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to release LCB access, err %d\n",
+ __func__, ret);
+ /* restore host access if the grant didn't work */
+ set_host_lcb_access(dd);
+ goto done;
+ }
+ }
+ dd->lcb_access_count--;
+done:
+ mutex_unlock(&dd->pport->hls_lock);
+ return ret;
+}
+
+/*
+ * Initialize LCB access variables and state. Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host. The driver defaults to
+ * leaving access to the 8051. Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done. In particular, after
+ * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+ dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to a 8051 request.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+ write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
+ DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK |
+ (u64)return_code <<
+ DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT |
+ (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+}
+
+/*
+ * Handle host requests from the 8051.
+ */
+static void handle_8051_request(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+ u16 data = 0;
+ u8 type;
+
+ reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+ if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
+ return; /* no request */
+
+ /* zero out COMPLETED so the response is seen */
+ write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+ /* extract request details */
+ type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+ & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+ data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+ & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+ switch (type) {
+ case HREQ_LOAD_CONFIG:
+ case HREQ_SAVE_CONFIG:
+ case HREQ_READ_CONFIG:
+ case HREQ_SET_TX_EQ_ABS:
+ case HREQ_SET_TX_EQ_REL:
+ case HREQ_ENABLE:
+ dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+ type);
+ hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+ break;
+ case HREQ_CONFIG_DONE:
+ hreq_response(dd, HREQ_SUCCESS, 0);
+ break;
+
+ case HREQ_INTERFACE_TEST:
+ hreq_response(dd, HREQ_SUCCESS, data);
+ break;
+ default:
+ dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+ hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+ break;
+ }
+}
+
+static void write_global_credit(struct hfi1_devdata *dd,
+ u8 vau, u16 total, u16 shared)
+{
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+ ((u64)total <<
+ SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) |
+ ((u64)shared <<
+ SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) |
+ ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+}
+
+/*
+ * Set up initial VL15 credits of the remote. Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset .
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+ /* leave shared count at zero for both global and VL15 */
+ write_global_credit(dd, vau, vl15buf, 0);
+
+ /* We may need some credits for another VL when sending packets
+ * with the snoop interface. Dividing it down the middle for VL15
+ * and VL0 should suffice.
+ */
+ if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+ write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
+ << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+ write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
+ << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+ } else {
+ write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
+ << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+ }
+}
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* remove all previous VL credit limits */
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
+ write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+ write_global_credit(dd, 0, 0, 0);
+ /* reset the CM block */
+ pio_send_control(dd, PSC_CM_RESET);
+}
+
+/* convert a vCU to a CU */
+static u32 vcu_to_cu(u8 vcu)
+{
+ return 1 << vcu;
+}
+
+/* convert a CU to a vCU */
+static u8 cu_to_vcu(u32 cu)
+{
+ return ilog2(cu);
+}
+
+/* convert a vAU to an AU */
+static u32 vau_to_au(u8 vau)
+{
+ return 8 * (1 << vau);
+}
+
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+ ppd->sm_trap_qp = 0x0;
+ ppd->sa_qp = 0x1;
+}
+
+/*
+ * Graceful LCB shutdown. This leaves the LCB FIFOs in reset.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+ u64 reg;
+
+ /* clear lcb run: LCB_CFG_RUN.EN = 0 */
+ write_csr(dd, DC_LCB_CFG_RUN, 0);
+ /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+ 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+ /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+ dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+ reg = read_csr(dd, DCC_CFG_RESET);
+ write_csr(dd, DCC_CFG_RESET, reg |
+ (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) |
+ (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+ (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+ if (!abort) {
+ udelay(1); /* must hold for the longer of 16cclks or 20ns */
+ write_csr(dd, DCC_CFG_RESET, reg);
+ write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+ }
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ if (dd->dc_shutdown) {
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ return;
+ }
+ dd->dc_shutdown = 1;
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ /* Shutdown the LCB */
+ lcb_shutdown(dd, 1);
+ /*
+ * Going to OFFLINE would have causes the 8051 to put the
+ * SerDes into reset already. Just need to shut down the 8051,
+ * itself.
+ */
+ write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/*
+ * Calling this after the DC has been brought out of reset should not
+ * do any damage.
+ */
+static void dc_start(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ if (!dd->dc_shutdown)
+ goto done;
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ /* Take the 8051 out of reset */
+ write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+ /* Wait until 8051 is ready */
+ ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+ if (ret) {
+ dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+ __func__);
+ }
+ /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+ write_csr(dd, DCC_CFG_RESET, 0x10);
+ /* lcb_shutdown() with abort=1 does not restore these */
+ write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ dd->dc_shutdown = 0;
+done:
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+ u64 rx_radr, tx_radr;
+ u32 version;
+
+ if (dd->icode != ICODE_FPGA_EMULATION)
+ return;
+
+ /*
+ * These LCB defaults on emulator _s are good, nothing to do here:
+ * LCB_CFG_TX_FIFOS_RADR
+ * LCB_CFG_RX_FIFOS_RADR
+ * LCB_CFG_LN_DCLK
+ * LCB_CFG_IGNORE_LOST_RCLK
+ */
+ if (is_emulator_s(dd))
+ return;
+ /* else this is _p */
+
+ version = emulator_rev(dd);
+ if (!is_ax(dd))
+ version = 0x2d; /* all B0 use 0x2d or higher settings */
+
+ if (version <= 0x12) {
+ /* release 0x12 and below */
+
+ /*
+ * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+ * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+ * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+ */
+ rx_radr =
+ 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ /*
+ * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+ * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+ */
+ tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ } else if (version <= 0x18) {
+ /* release 0x13 up to 0x18 */
+ /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+ rx_radr =
+ 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ } else if (version == 0x19) {
+ /* release 0x19 */
+ /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+ rx_radr =
+ 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ } else if (version == 0x1a) {
+ /* release 0x1a */
+ /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+ rx_radr =
+ 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+ } else {
+ /* release 0x1b and higher */
+ /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+ rx_radr =
+ 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+ | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+ | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+ tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+ }
+
+ write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+ /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+ write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+ DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle a SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ sma_message_work);
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 msg;
+ int ret;
+
+ /*
+ * msg is bytes 1-4 of the 40-bit idle message - the command code
+ * is stripped off
+ */
+ ret = read_idle_sma(dd, &msg);
+ if (ret)
+ return;
+ dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+ /*
+ * React to the SMA message. Byte[1] (0 for us) is the command.
+ */
+ switch (msg & 0xff) {
+ case SMA_IDLE_ARM:
+ /*
+ * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+ * State Transitions
+ *
+ * Only expected in INIT or ARMED, discard otherwise.
+ */
+ if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+ ppd->neighbor_normal = 1;
+ break;
+ case SMA_IDLE_ACTIVE:
+ /*
+ * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+ * State Transitions
+ *
+ * Can activate the node. Discard otherwise.
+ */
+ if (ppd->host_link_state == HLS_UP_ARMED &&
+ ppd->is_active_optimize_enabled) {
+ ppd->neighbor_normal = 1;
+ ret = set_link_state(ppd, HLS_UP_ACTIVE);
+ if (ret)
+ dd_dev_err(
+ dd,
+ "%s: received Active SMA idle message, couldn't set link to Active\n",
+ __func__);
+ }
+ break;
+ default:
+ dd_dev_err(dd,
+ "%s: received unexpected SMA idle message 0x%llx\n",
+ __func__, msg);
+ break;
+ }
+}
+
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+ u64 rcvctrl;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+ rcvctrl = read_csr(dd, RCV_CTRL);
+ rcvctrl |= add;
+ rcvctrl &= ~clear;
+ write_csr(dd, RCV_CTRL, rcvctrl);
+ spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+ adjust_rcvctrl(dd, add, 0);
+}
+
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+ adjust_rcvctrl(dd, 0, clear);
+}
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct send_context *sc;
+ int i;
+
+ if (flags & FREEZE_SELF)
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+ /* enter frozen mode */
+ dd->flags |= HFI1_FROZEN;
+
+ /* notify all SDMA engines that they are going into a freeze */
+ sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+ /* do halt pre-handling on all enabled send contexts */
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ sc = dd->send_contexts[i].sc;
+ if (sc && (sc->flags & SCF_ENABLED))
+ sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+ }
+
+ /* Send context are frozen. Notify user space */
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+ if (flags & FREEZE_ABORT) {
+ dd_dev_err(dd,
+ "Aborted freeze recovery. Please REBOOT system\n");
+ return;
+ }
+ /* queue non-interrupt handler */
+ queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+ unsigned long timeout;
+ u64 reg;
+
+ timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, CCE_STATUS);
+ if (freeze) {
+ /* waiting until all indicators are set */
+ if ((reg & ALL_FROZE) == ALL_FROZE)
+ return; /* all done */
+ } else {
+ /* waiting until all indicators are clear */
+ if ((reg & ALL_FROZE) == 0)
+ return; /* all done */
+ }
+
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+ freeze ? "" : "un", reg & ALL_FROZE,
+ freeze ? ALL_FROZE : 0ull);
+ return;
+ }
+ usleep_range(80, 120);
+ }
+}
+
+/*
+ * Do all freeze handling for the RXE block.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* disable port */
+ clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+ /* disable all receive contexts */
+ for (i = 0; i < dd->num_rcv_contexts; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port. User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ *
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+ u32 rcvmask;
+ int i;
+
+ /* enable all kernel contexts */
+ for (i = 0; i < dd->n_krcv_queues; i++) {
+ rcvmask = HFI1_RCVCTRL_CTXT_ENB;
+ /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
+ rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+ HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+ hfi1_rcvctrl(dd, rcvmask, i);
+ }
+
+ /* enable port */
+ add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ */
+void handle_freeze(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ freeze_work);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /* wait for freeze indicators on all affected blocks */
+ wait_for_freeze_status(dd, 1);
+
+ /* SPC is now frozen */
+
+ /* do send PIO freeze steps */
+ pio_freeze(dd);
+
+ /* do send DMA freeze steps */
+ sdma_freeze(dd);
+
+ /* do send egress freeze steps - nothing to do */
+
+ /* do receive freeze steps */
+ rxe_freeze(dd);
+
+ /*
+ * Unfreeze the hardware - clear the freeze, wait for each
+ * block's frozen bit to clear, then clear the frozen flag.
+ */
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+ wait_for_freeze_status(dd, 0);
+
+ if (is_ax(dd)) {
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+ wait_for_freeze_status(dd, 1);
+ write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+ wait_for_freeze_status(dd, 0);
+ }
+
+ /* do send PIO unfreeze steps for kernel contexts */
+ pio_kernel_unfreeze(dd);
+
+ /* do send DMA unfreeze steps */
+ sdma_unfreeze(dd);
+
+ /* do send egress unfreeze steps - nothing to do */
+
+ /* do receive unfreeze steps for kernel contexts */
+ rxe_kernel_unfreeze(dd);
+
+ /*
+ * The unfreeze procedure touches global device registers when
+ * it disables and re-enables RXE. Mark the device unfrozen
+ * after all that is done so other parts of the driver waiting
+ * for the device to unfreeze don't do things out of order.
+ *
+ * The above implies that the meaning of HFI1_FROZEN flag is
+ * "Device has gone into freeze mode and freeze mode handling
+ * is still in progress."
+ *
+ * The flag will be removed when freeze mode processing has
+ * completed.
+ */
+ dd->flags &= ~HFI1_FROZEN;
+ wake_up(&dd->event_queue);
+
+ /* no longer frozen */
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_up(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_up_work);
+ set_link_state(ppd, HLS_UP_INIT);
+
+ /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+ read_ltp_rtt(ppd->dd);
+ /*
+ * OPA specifies that certain counters are cleared on a transition
+ * to link up, so do that.
+ */
+ clear_linkup_counters(ppd->dd);
+ /*
+ * And (re)set link up default values.
+ */
+ set_linkup_defaults(ppd);
+
+ /* enforce link speed enabled */
+ if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
+ /* oops - current speed is not enabled, bounce */
+ dd_dev_err(ppd->dd,
+ "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+ ppd->link_speed_active, ppd->link_speed_enabled);
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+ OPA_LINKDOWN_REASON_SPEED_POLICY);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ tune_serdes(ppd);
+ start_link(ppd);
+ }
+}
+
+/*
+ * Several pieces of LNI information were cached for SMA in ppd.
+ * Reset these on link down
+ */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+ ppd->neighbor_guid = 0;
+ ppd->neighbor_port_number = 0;
+ ppd->neighbor_type = 0;
+ ppd->neighbor_fm_security = 0;
+}
+
+static const char * const link_down_reason_strs[] = {
+ [OPA_LINKDOWN_REASON_NONE] = "None",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0",
+ [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+ [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+ [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+ [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+ [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+ [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+ [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+ [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+ [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+ [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+ [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+ [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+ [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+ [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+ [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+ [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+ [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+ [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+ [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+ [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+ [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+ "Excessive buffer overrun",
+ [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+ [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+ [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+ [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+ [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+ [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+ [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+ [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+ "Local media not installed",
+ [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+ [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+ [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+ "End to end not installed",
+ [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+ [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+ [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+ [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+ [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+ [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+ const char *str = NULL;
+
+ if (reason < ARRAY_SIZE(link_down_reason_strs))
+ str = link_down_reason_strs[reason];
+ if (!str)
+ str = "(invalid)";
+
+ return str;
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_down(struct work_struct *work)
+{
+ u8 lcl_reason, neigh_reason = 0;
+ u8 link_down_reason;
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_down_work);
+ int was_up;
+ static const char ldr_str[] = "Link down reason: ";
+
+ if ((ppd->host_link_state &
+ (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
+ ppd->port_type == PORT_TYPE_FIXED)
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
+
+ /* Go offline first, then deal with reading/writing through 8051 */
+ was_up = !!(ppd->host_link_state & HLS_UP);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+
+ if (was_up) {
+ lcl_reason = 0;
+ /* link down reason is only valid if the link was up */
+ read_link_down_reason(ppd->dd, &link_down_reason);
+ switch (link_down_reason) {
+ case LDR_LINK_TRANSFER_ACTIVE_LOW:
+ /* the link went down, no idle message reason */
+ dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+ ldr_str);
+ break;
+ case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+ /*
+ * The neighbor reason is only valid if an idle message
+ * was received for it.
+ */
+ read_planned_down_reason_code(ppd->dd, &neigh_reason);
+ dd_dev_info(ppd->dd,
+ "%sNeighbor link down message %d, %s\n",
+ ldr_str, neigh_reason,
+ link_down_reason_str(neigh_reason));
+ break;
+ case LDR_RECEIVED_HOST_OFFLINE_REQ:
+ dd_dev_info(ppd->dd,
+ "%sHost requested link to go offline\n",
+ ldr_str);
+ break;
+ default:
+ dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+ ldr_str, link_down_reason);
+ break;
+ }
+
+ /*
+ * If no reason, assume peer-initiated but missed
+ * LinkGoingDown idle flits.
+ */
+ if (neigh_reason == 0)
+ lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+ } else {
+ /* went down while polling or going up */
+ lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+ }
+
+ set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+ /* inform the SMA when the link transitions from up to down */
+ if (was_up && ppd->local_link_down_reason.sma == 0 &&
+ ppd->neigh_link_down_reason.sma == 0) {
+ ppd->local_link_down_reason.sma =
+ ppd->local_link_down_reason.latest;
+ ppd->neigh_link_down_reason.sma =
+ ppd->neigh_link_down_reason.latest;
+ }
+
+ reset_neighbor_info(ppd);
+
+ /* disable the port */
+ clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+ /*
+ * If there is no cable attached, turn the DC off. Otherwise,
+ * start the link bring up.
+ */
+ if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
+ dc_shutdown(ppd->dd);
+ } else {
+ tune_serdes(ppd);
+ start_link(ppd);
+ }
+}
+
+void handle_link_bounce(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_bounce_work);
+
+ /*
+ * Only do something if the link is currently up.
+ */
+ if (ppd->host_link_state & HLS_UP) {
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ tune_serdes(ppd);
+ start_link(ppd);
+ } else {
+ dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+ __func__, link_state_name(ppd->host_link_state));
+ }
+}
+
+/*
+ * Mask conversion: Capability exchange to Port LTP. The capability
+ * exchange has an implicit 16b CRC that is mandatory.
+ */
+static int cap_to_port_ltp(int cap)
+{
+ int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
+
+ if (cap & CAP_CRC_14B)
+ port_ltp |= PORT_LTP_CRC_MODE_14;
+ if (cap & CAP_CRC_48B)
+ port_ltp |= PORT_LTP_CRC_MODE_48;
+ if (cap & CAP_CRC_12B_16B_PER_LANE)
+ port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
+
+ return port_ltp;
+}
+
+/*
+ * Convert an OPA Port LTP mask to capability mask
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+ int cap_mask = 0;
+
+ if (port_ltp & PORT_LTP_CRC_MODE_14)
+ cap_mask |= CAP_CRC_14B;
+ if (port_ltp & PORT_LTP_CRC_MODE_48)
+ cap_mask |= CAP_CRC_48B;
+ if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
+ cap_mask |= CAP_CRC_12B_16B_PER_LANE;
+
+ return cap_mask;
+}
+
+/*
+ * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+ int port_ltp = 0;
+
+ if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+ port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
+ else if (lcb_crc == LCB_CRC_48B)
+ port_ltp = PORT_LTP_CRC_MODE_48;
+ else if (lcb_crc == LCB_CRC_14B)
+ port_ltp = PORT_LTP_CRC_MODE_14;
+ else
+ port_ltp = PORT_LTP_CRC_MODE_16;
+
+ return port_ltp;
+}
+
+/*
+ * Our neighbor has indicated that we are allowed to act as a fabric
+ * manager, so place the full management partition key in the second
+ * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
+ * that we should already have the limited management partition key in
+ * array element 1, and also that the port is not yet up when
+ * add_full_mgmt_pkey() is invoked.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /* Sanity check - ppd->pkeys[2] should be 0, or already initalized */
+ if (!((ppd->pkeys[2] == 0) || (ppd->pkeys[2] == FULL_MGMT_P_KEY)))
+ dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+ __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+ ppd->pkeys[2] = FULL_MGMT_P_KEY;
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+ hfi1_event_pkey_change(ppd->dd, ppd->port);
+}
+
+static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+ if (ppd->pkeys[2] != 0) {
+ ppd->pkeys[2] = 0;
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+ hfi1_event_pkey_change(ppd->dd, ppd->port);
+ }
+}
+
+/*
+ * Convert the given link width to the OPA link width bitmask.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+ switch (width) {
+ case 0:
+ /*
+ * Simulator and quick linkup do not set the width.
+ * Just set it to 4x without complaint.
+ */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+ return OPA_LINK_WIDTH_4X;
+ return 0; /* no lanes up */
+ case 1: return OPA_LINK_WIDTH_1X;
+ case 2: return OPA_LINK_WIDTH_2X;
+ case 3: return OPA_LINK_WIDTH_3X;
+ default:
+ dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+ __func__, width);
+ /* fall through */
+ case 4: return OPA_LINK_WIDTH_4X;
+ }
+}
+
+/*
+ * Do a population count on the bottom nibble.
+ */
+static const u8 bit_counts[16] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
+};
+
+static inline u8 nibble_to_count(u8 nibble)
+{
+ return bit_counts[nibble & 0xf];
+}
+
+/*
+ * Read the active lane information from the 8051 registers and return
+ * their widths.
+ *
+ * Active lane information is found in these 8051 registers:
+ * enable_lane_tx
+ * enable_lane_rx
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+ u16 *rx_width)
+{
+ u16 tx, rx;
+ u8 enable_lane_rx;
+ u8 enable_lane_tx;
+ u8 tx_polarity_inversion;
+ u8 rx_polarity_inversion;
+ u8 max_rate;
+
+ /* read the active lanes */
+ read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+ &rx_polarity_inversion, &max_rate);
+ read_local_lni(dd, &enable_lane_rx);
+
+ /* convert to counts */
+ tx = nibble_to_count(enable_lane_tx);
+ rx = nibble_to_count(enable_lane_rx);
+
+ /*
+ * Set link_speed_active here, overriding what was set in
+ * handle_verify_cap(). The ASIC 8051 firmware does not correctly
+ * set the max_rate field in handle_verify_cap until v0.19.
+ */
+ if ((dd->icode == ICODE_RTL_SILICON) &&
+ (dd->dc8051_ver < dc8051_ver(0, 19))) {
+ /* max_rate: 0 = 12.5G, 1 = 25G */
+ switch (max_rate) {
+ case 0:
+ dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+ break;
+ default:
+ dd_dev_err(dd,
+ "%s: unexpected max rate %d, using 25Gb\n",
+ __func__, (int)max_rate);
+ /* fall through */
+ case 1:
+ dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+ break;
+ }
+ }
+
+ dd_dev_info(dd,
+ "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+ enable_lane_tx, tx, enable_lane_rx, rx);
+ *tx_width = link_width_to_bits(dd, tx);
+ *rx_width = link_width_to_bits(dd, rx);
+}
+
+/*
+ * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
+ * Valid after the end of VerifyCap and during LinkUp. Does not change
+ * after link up. I.e. look elsewhere for downgrade information.
+ *
+ * Bits are:
+ * + bits [7:4] contain the number of active transmitters
+ * + bits [3:0] contain the number of active receivers
+ * These are numbers 1 through 4 and can be different values if the
+ * link is asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+ u16 *rx_width)
+{
+ u16 widths, tx, rx;
+ u8 misc_bits, local_flags;
+ u16 active_tx, active_rx;
+
+ read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+ tx = widths >> 12;
+ rx = (widths >> 8) & 0xf;
+
+ *tx_width = link_width_to_bits(dd, tx);
+ *rx_width = link_width_to_bits(dd, rx);
+
+ /* print the active widths */
+ get_link_widths(dd, &active_tx, &active_rx);
+}
+
+/*
+ * Set ppd->link_width_active and ppd->link_width_downgrade_active using
+ * hardware information when the link first comes up.
+ *
+ * The link width is not available until after VerifyCap.AllFramesReceived
+ * (the trigger for handle_verify_cap), so this is outside that routine
+ * and should be called when the 8051 signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+ u16 tx_width, rx_width;
+
+ /* get end-of-LNI link widths */
+ get_linkup_widths(ppd->dd, &tx_width, &rx_width);
+
+ /* use tx_width as the link is supposed to be symmetric on link up */
+ ppd->link_width_active = tx_width;
+ /* link width downgrade active (LWD.A) starts out matching LW.A */
+ ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+ ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+ /* per OPA spec, on link up LWD.E resets to LWD.S */
+ ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+ /* cache the active egress rate (units {10^6 bits/sec]) */
+ ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_vc_work);
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+ u8 power_management;
+ u8 continious;
+ u8 vcu;
+ u8 vau;
+ u8 z;
+ u16 vl15buf;
+ u16 link_widths;
+ u16 crc_mask;
+ u16 crc_val;
+ u16 device_id;
+ u16 active_tx, active_rx;
+ u8 partner_supported_crc;
+ u8 remote_tx_rate;
+ u8 device_rev;
+
+ set_link_state(ppd, HLS_VERIFY_CAP);
+
+ lcb_shutdown(dd, 0);
+ adjust_lcb_for_fpga_serdes(dd);
+
+ /*
+ * These are now valid:
+ * remote VerifyCap fields in the general LNI config
+ * CSR DC8051_STS_REMOTE_GUID
+ * CSR DC8051_STS_REMOTE_NODE_TYPE
+ * CSR DC8051_STS_REMOTE_FM_SECURITY
+ * CSR DC8051_STS_REMOTE_PORT_NO
+ */
+
+ read_vc_remote_phy(dd, &power_management, &continious);
+ read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf,
+ &partner_supported_crc);
+ read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+ read_remote_device_id(dd, &device_id, &device_rev);
+ /*
+ * And the 'MgmtAllowed' information, which is exchanged during
+ * LNI, is also be available at this point.
+ */
+ read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+ /* print the active widths */
+ get_link_widths(dd, &active_tx, &active_rx);
+ dd_dev_info(dd,
+ "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+ (int)power_management, (int)continious);
+ dd_dev_info(dd,
+ "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+ (int)vau, (int)z, (int)vcu, (int)vl15buf,
+ (int)partner_supported_crc);
+ dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+ (u32)remote_tx_rate, (u32)link_widths);
+ dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+ (u32)device_id, (u32)device_rev);
+ /*
+ * The peer vAU value just read is the peer receiver value. HFI does
+ * not support a transmit vAU of 0 (AU == 8). We advertised that
+ * with Z=1 in the fabric capabilities sent to the peer. The peer
+ * will see our Z=1, and, if it advertised a vAU of 0, will move its
+ * receive to vAU of 1 (AU == 16). Do the same here. We do not care
+ * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+ * subject to the Z value exception.
+ */
+ if (vau == 0)
+ vau = 1;
+ set_up_vl15(dd, vau, vl15buf);
+
+ /* set up the LCB CRC mode */
+ crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+ /* order is important: use the lowest bit in common */
+ if (crc_mask & CAP_CRC_14B)
+ crc_val = LCB_CRC_14B;
+ else if (crc_mask & CAP_CRC_48B)
+ crc_val = LCB_CRC_48B;
+ else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+ crc_val = LCB_CRC_12B_16B_PER_LANE;
+ else
+ crc_val = LCB_CRC_16B;
+
+ dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+ write_csr(dd, DC_LCB_CFG_CRC_MODE,
+ (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+ /* set (14b only) or clear sideband credit */
+ reg = read_csr(dd, SEND_CM_CTRL);
+ if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+ write_csr(dd, SEND_CM_CTRL,
+ reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+ } else {
+ write_csr(dd, SEND_CM_CTRL,
+ reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+ }
+
+ ppd->link_speed_active = 0; /* invalid value */
+ if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+ /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
+ switch (remote_tx_rate) {
+ case 0:
+ ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+ break;
+ case 1:
+ ppd->link_speed_active = OPA_LINK_SPEED_25G;
+ break;
+ }
+ } else {
+ /* actual rate is highest bit of the ANDed rates */
+ u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+ if (rate & 2)
+ ppd->link_speed_active = OPA_LINK_SPEED_25G;
+ else if (rate & 1)
+ ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+ }
+ if (ppd->link_speed_active == 0) {
+ dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+ __func__, (int)remote_tx_rate);
+ ppd->link_speed_active = OPA_LINK_SPEED_25G;
+ }
+
+ /*
+ * Cache the values of the supported, enabled, and active
+ * LTP CRC modes to return in 'portinfo' queries. But the bit
+ * flags that are returned in the portinfo query differ from
+ * what's in the link_crc_mask, crc_sizes, and crc_val
+ * variables. Convert these here.
+ */
+ ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+ /* supported crc modes */
+ ppd->port_ltp_crc_mode |=
+ cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+ /* enabled crc modes */
+ ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+ /* active crc mode */
+
+ /* set up the remote credit return table */
+ assign_remote_cm_au_table(dd, vcu);
+
+ /*
+ * The LCB is reset on entry to handle_verify_cap(), so this must
+ * be applied on every link up.
+ *
+ * Adjust LCB error kill enable to kill the link if
+ * these RBUF errors are seen:
+ * REPLAY_BUF_MBE_SMASK
+ * FLIT_INPUT_BUF_MBE_SMASK
+ */
+ if (is_ax(dd)) { /* fixed in B0 */
+ reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+ reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+ | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+ write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+ }
+
+ /* pull LCB fifos out of reset - all fifo clocks must be stable */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+ /* give 8051 access to the LCB CSRs */
+ write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+ set_8051_lcb_access(dd);
+
+ ppd->neighbor_guid =
+ read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+ ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+ DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+ ppd->neighbor_type =
+ read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+ DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+ ppd->neighbor_fm_security =
+ read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+ DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+ dd_dev_info(dd,
+ "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+ ppd->neighbor_guid, ppd->neighbor_type,
+ ppd->mgmt_allowed, ppd->neighbor_fm_security);
+ if (ppd->mgmt_allowed)
+ add_full_mgmt_pkey(ppd);
+
+ /* tell the 8051 to go to LinkUp */
+ set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+ int do_bounce = 0;
+ int tries;
+ u16 lwde;
+ u16 tx, rx;
+
+ /* use the hls lock to avoid a race with actual link up */
+ tries = 0;
+retry:
+ mutex_lock(&ppd->hls_lock);
+ /* only apply if the link is up */
+ if (ppd->host_link_state & HLS_DOWN) {
+ /* still going up..wait and retry */
+ if (ppd->host_link_state & HLS_GOING_UP) {
+ if (++tries < 1000) {
+ mutex_unlock(&ppd->hls_lock);
+ usleep_range(100, 120); /* arbitrary */
+ goto retry;
+ }
+ dd_dev_err(ppd->dd,
+ "%s: giving up waiting for link state change\n",
+ __func__);
+ }
+ goto done;
+ }
+
+ lwde = ppd->link_width_downgrade_enabled;
+
+ if (refresh_widths) {
+ get_link_widths(ppd->dd, &tx, &rx);
+ ppd->link_width_downgrade_tx_active = tx;
+ ppd->link_width_downgrade_rx_active = rx;
+ }
+
+ if (ppd->link_width_downgrade_tx_active == 0 ||
+ ppd->link_width_downgrade_rx_active == 0) {
+ /* the 8051 reported a dead link as a downgrade */
+ dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+ } else if (lwde == 0) {
+ /* downgrade is disabled */
+
+ /* bounce if not at starting active width */
+ if ((ppd->link_width_active !=
+ ppd->link_width_downgrade_tx_active) ||
+ (ppd->link_width_active !=
+ ppd->link_width_downgrade_rx_active)) {
+ dd_dev_err(ppd->dd,
+ "Link downgrade is disabled and link has downgraded, downing link\n");
+ dd_dev_err(ppd->dd,
+ " original 0x%x, tx active 0x%x, rx active 0x%x\n",
+ ppd->link_width_active,
+ ppd->link_width_downgrade_tx_active,
+ ppd->link_width_downgrade_rx_active);
+ do_bounce = 1;
+ }
+ } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 ||
+ (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+ /* Tx or Rx is outside the enabled policy */
+ dd_dev_err(ppd->dd,
+ "Link is outside of downgrade allowed, downing link\n");
+ dd_dev_err(ppd->dd,
+ " enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+ lwde, ppd->link_width_downgrade_tx_active,
+ ppd->link_width_downgrade_rx_active);
+ do_bounce = 1;
+ }
+
+done:
+ mutex_unlock(&ppd->hls_lock);
+
+ if (do_bounce) {
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+ OPA_LINKDOWN_REASON_WIDTH_POLICY);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ tune_serdes(ppd);
+ start_link(ppd);
+ }
+}
+
+/*
+ * Handle a link downgrade interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ link_downgrade_work);
+
+ dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+ apply_link_downgrade_policy(ppd, 1);
+}
+
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dcc_err_flags,
+ ARRAY_SIZE(dcc_err_flags));
+}
+
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, lcb_err_flags,
+ ARRAY_SIZE(lcb_err_flags));
+}
+
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dc8051_err_flags,
+ ARRAY_SIZE(dc8051_err_flags));
+}
+
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+ ARRAY_SIZE(dc8051_info_err_flags));
+}
+
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+ return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+ ARRAY_SIZE(dc8051_info_host_msg_flags));
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ u64 info, err, host_msg;
+ int queue_link_down = 0;
+ char buf[96];
+
+ /* look at the flags */
+ if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+ /* 8051 information set by firmware */
+ /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+ info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+ err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+ & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+ host_msg = (info >>
+ DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+ & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+ /*
+ * Handle error flags.
+ */
+ if (err & FAILED_LNI) {
+ /*
+ * LNI error indications are cleared by the 8051
+ * only when starting polling. Only pay attention
+ * to them when in the states that occur during
+ * LNI.
+ */
+ if (ppd->host_link_state
+ & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+ queue_link_down = 1;
+ dd_dev_info(dd, "Link error: %s\n",
+ dc8051_info_err_string(buf,
+ sizeof(buf),
+ err &
+ FAILED_LNI));
+ }
+ err &= ~(u64)FAILED_LNI;
+ }
+ /* unknown frames can happen durning LNI, just count */
+ if (err & UNKNOWN_FRAME) {
+ ppd->unknown_frame_count++;
+ err &= ~(u64)UNKNOWN_FRAME;
+ }
+ if (err) {
+ /* report remaining errors, but do not do anything */
+ dd_dev_err(dd, "8051 info error: %s\n",
+ dc8051_info_err_string(buf, sizeof(buf),
+ err));
+ }
+
+ /*
+ * Handle host message flags.
+ */
+ if (host_msg & HOST_REQ_DONE) {
+ /*
+ * Presently, the driver does a busy wait for
+ * host requests to complete. This is only an
+ * informational message.
+ * NOTE: The 8051 clears the host message
+ * information *on the next 8051 command*.
+ * Therefore, when linkup is achieved,
+ * this flag will still be set.
+ */
+ host_msg &= ~(u64)HOST_REQ_DONE;
+ }
+ if (host_msg & BC_SMA_MSG) {
+ queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+ host_msg &= ~(u64)BC_SMA_MSG;
+ }
+ if (host_msg & LINKUP_ACHIEVED) {
+ dd_dev_info(dd, "8051: Link up\n");
+ queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+ host_msg &= ~(u64)LINKUP_ACHIEVED;
+ }
+ if (host_msg & EXT_DEVICE_CFG_REQ) {
+ handle_8051_request(ppd);
+ host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+ }
+ if (host_msg & VERIFY_CAP_FRAME) {
+ queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+ host_msg &= ~(u64)VERIFY_CAP_FRAME;
+ }
+ if (host_msg & LINK_GOING_DOWN) {
+ const char *extra = "";
+ /* no downgrade action needed if going down */
+ if (host_msg & LINK_WIDTH_DOWNGRADED) {
+ host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+ extra = " (ignoring downgrade)";
+ }
+ dd_dev_info(dd, "8051: Link down%s\n", extra);
+ queue_link_down = 1;
+ host_msg &= ~(u64)LINK_GOING_DOWN;
+ }
+ if (host_msg & LINK_WIDTH_DOWNGRADED) {
+ queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+ host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+ }
+ if (host_msg) {
+ /* report remaining messages, but do not do anything */
+ dd_dev_info(dd, "8051 info host message: %s\n",
+ dc8051_info_host_msg_string(buf,
+ sizeof(buf),
+ host_msg));
+ }
+
+ reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+ }
+ if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+ /*
+ * Lost the 8051 heartbeat. If this happens, we
+ * receive constant interrupts about it. Disable
+ * the interrupt after the first.
+ */
+ dd_dev_err(dd, "Lost 8051 heartbeat\n");
+ write_csr(dd, DC_DC8051_ERR_EN,
+ read_csr(dd, DC_DC8051_ERR_EN) &
+ ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+ reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+ }
+ if (reg) {
+ /* report the error, but do not do anything */
+ dd_dev_err(dd, "8051 error: %s\n",
+ dc8051_err_string(buf, sizeof(buf), reg));
+ }
+
+ if (queue_link_down) {
+ /*
+ * if the link is already going down or disabled, do not
+ * queue another
+ */
+ if ((ppd->host_link_state &
+ (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) ||
+ ppd->link_enabled == 0) {
+ dd_dev_info(dd, "%s: not queuing link down\n",
+ __func__);
+ } else {
+ queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+ }
+ }
+}
+
+static const char * const fm_config_txt[] = {
+[0] =
+ "BadHeadDist: Distance violation between two head flits",
+[1] =
+ "BadTailDist: Distance violation between two tail flits",
+[2] =
+ "BadCtrlDist: Distance violation between two credit control flits",
+[3] =
+ "BadCrdAck: Credits return for unsupported VL",
+[4] =
+ "UnsupportedVLMarker: Received VL Marker",
+[5] =
+ "BadPreempt: Exceeded the preemption nesting level",
+[6] =
+ "BadControlFlit: Received unsupported control flit",
+/* no 7 */
+[8] =
+ "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+static const char * const port_rcv_txt[] = {
+[1] =
+ "BadPktLen: Illegal PktLen",
+[2] =
+ "PktLenTooLong: Packet longer than PktLen",
+[3] =
+ "PktLenTooShort: Packet shorter than PktLen",
+[4] =
+ "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+[5] =
+ "BadDLID: Illegal DLID (0, doesn't match HFI)",
+[6] =
+ "BadL2: Illegal L2 opcode",
+[7] =
+ "BadSC: Unsupported SC",
+[9] =
+ "BadRC: Illegal RC",
+[11] =
+ "PreemptError: Preempting with same VL",
+[12] =
+ "PreemptVL15: Preempting a VL15 packet",
+};
+
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ u64 info, hdr0, hdr1;
+ const char *extra;
+ char buf[96];
+ struct hfi1_pportdata *ppd = dd->pport;
+ u8 lcl_reason = 0;
+ int do_bounce = 0;
+
+ if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+ if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+ info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+ dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+ /* set status bit */
+ dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+ }
+ reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+ struct hfi1_pportdata *ppd = dd->pport;
+ /* this counter saturates at (2^32) - 1 */
+ if (ppd->link_downed < (u32)UINT_MAX)
+ ppd->link_downed++;
+ reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+ u8 reason_valid = 1;
+
+ info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+ if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+ dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+ /* set status bit */
+ dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+ }
+ switch (info) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ extra = fm_config_txt[info];
+ break;
+ case 8:
+ extra = fm_config_txt[info];
+ if (ppd->port_error_action &
+ OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+ do_bounce = 1;
+ /*
+ * lcl_reason cannot be derived from info
+ * for this error
+ */
+ lcl_reason =
+ OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+ }
+ break;
+ default:
+ reason_valid = 0;
+ snprintf(buf, sizeof(buf), "reserved%lld", info);
+ extra = buf;
+ break;
+ }
+
+ if (reason_valid && !do_bounce) {
+ do_bounce = ppd->port_error_action &
+ (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+ lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+ }
+
+ /* just report this */
+ dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+ reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+ u8 reason_valid = 1;
+
+ info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+ hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+ hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+ if (!(dd->err_info_rcvport.status_and_code &
+ OPA_EI_STATUS_SMASK)) {
+ dd->err_info_rcvport.status_and_code =
+ info & OPA_EI_CODE_SMASK;
+ /* set status bit */
+ dd->err_info_rcvport.status_and_code |=
+ OPA_EI_STATUS_SMASK;
+ /*
+ * save first 2 flits in the packet that caused
+ * the error
+ */
+ dd->err_info_rcvport.packet_flit1 = hdr0;
+ dd->err_info_rcvport.packet_flit2 = hdr1;
+ }
+ switch (info) {
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ case 9:
+ case 11:
+ case 12:
+ extra = port_rcv_txt[info];
+ break;
+ default:
+ reason_valid = 0;
+ snprintf(buf, sizeof(buf), "reserved%lld", info);
+ extra = buf;
+ break;
+ }
+
+ if (reason_valid && !do_bounce) {
+ do_bounce = ppd->port_error_action &
+ (1 << (OPA_LDR_PORTRCV_OFFSET + info));
+ lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+ }
+
+ /* just report this */
+ dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+ dd_dev_info(dd, " hdr0 0x%llx, hdr1 0x%llx\n",
+ hdr0, hdr1);
+
+ reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+ }
+
+ if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+ /* informative only */
+ dd_dev_info(dd, "8051 access to LCB blocked\n");
+ reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+ }
+ if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+ /* informative only */
+ dd_dev_info(dd, "host access to LCB blocked\n");
+ reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+ }
+
+ /* report any remaining errors */
+ if (reg)
+ dd_dev_info(dd, "DCC Error: %s\n",
+ dcc_err_string(buf, sizeof(buf), reg));
+
+ if (lcl_reason == 0)
+ lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+ if (do_bounce) {
+ dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+ set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+ queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+ }
+}
+
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+ char buf[96];
+
+ dd_dev_info(dd, "LCB Error: %s\n",
+ lcb_err_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * CCE block DC interrupt. Source is < 8.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct err_reg_info *eri = &dc_errs[source];
+
+ if (eri->handler) {
+ interrupt_clear_down(dd, 0, eri);
+ } else if (source == 3 /* dc_lbm_int */) {
+ /*
+ * This indicates that a parity error has occurred on the
+ * address/control lines presented to the LBM. The error
+ * is a single pulse, there is no associated error flag,
+ * and it is non-maskable. This is because if a parity
+ * error occurs on the request the request is dropped.
+ * This should never occur, but it is nice to know if it
+ * ever does.
+ */
+ dd_dev_err(dd, "Parity error in DC LBM block\n");
+ } else {
+ dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+ }
+}
+
+/*
+ * TX block send credit interrupt. Source is < 160.
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt. Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ * 0 - N-1 = SDma
+ * N - 2N-1 = SDmaProgress
+ * 2N - 3N-1 = SDmaIdle
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ /* what interrupt */
+ unsigned int what = source / TXE_NUM_SDMA_ENGINES;
+ /* which engine */
+ unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+ slashstrip(__FILE__), __LINE__, __func__);
+ sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+ if (likely(what < 3 && which < dd->num_sdma)) {
+ sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+ } else {
+ /* should not happen */
+ dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+ }
+}
+
+/*
+ * RX block receive available interrupt. Source is < 160.
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ struct hfi1_ctxtdata *rcd;
+ char *err_detail;
+
+ if (likely(source < dd->num_rcv_contexts)) {
+ rcd = dd->rcd[source];
+ if (rcd) {
+ if (source < dd->first_user_ctxt)
+ rcd->do_interrupt(rcd, 0);
+ else
+ handle_user_interrupt(rcd);
+ return; /* OK */
+ }
+ /* received an interrupt, but no rcd */
+ err_detail = "dataless";
+ } else {
+ /* received an interrupt, but are not using that context */
+ err_detail = "out of range";
+ }
+ dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+ err_detail, source);
+}
+
+/*
+ * RX block receive urgent interrupt. Source is < 160.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ struct hfi1_ctxtdata *rcd;
+ char *err_detail;
+
+ if (likely(source < dd->num_rcv_contexts)) {
+ rcd = dd->rcd[source];
+ if (rcd) {
+ /* only pay attention to user urgent interrupts */
+ if (source >= dd->first_user_ctxt)
+ handle_user_interrupt(rcd);
+ return; /* OK */
+ }
+ /* received an interrupt, but no rcd */
+ err_detail = "dataless";
+ } else {
+ /* received an interrupt, but are not using that context */
+ err_detail = "out of range";
+ }
+ dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+ err_detail, source);
+}
+
+/*
+ * Reserved range interrupt. Should not be called in normal operation.
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+ char name[64];
+
+ dd_dev_err(dd, "unexpected %s interrupt\n",
+ is_reserved_name(name, sizeof(name), source));
+}
+
+static const struct is_table is_table[] = {
+/*
+ * start end
+ * name func interrupt func
+ */
+{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END,
+ is_misc_err_name, is_misc_err_int },
+{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END,
+ is_sdma_eng_err_name, is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+ is_sendctxt_err_name, is_sendctxt_err_int },
+{ IS_SDMA_START, IS_SDMA_END,
+ is_sdma_eng_name, is_sdma_eng_int },
+{ IS_VARIOUS_START, IS_VARIOUS_END,
+ is_various_name, is_various_int },
+{ IS_DC_START, IS_DC_END,
+ is_dc_name, is_dc_int },
+{ IS_RCVAVAIL_START, IS_RCVAVAIL_END,
+ is_rcv_avail_name, is_rcv_avail_int },
+{ IS_RCVURGENT_START, IS_RCVURGENT_END,
+ is_rcv_urgent_name, is_rcv_urgent_int },
+{ IS_SENDCREDIT_START, IS_SENDCREDIT_END,
+ is_send_credit_name, is_send_credit_int},
+{ IS_RESERVED_START, IS_RESERVED_END,
+ is_reserved_name, is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+ const struct is_table *entry;
+
+ /* avoids a double compare by walking the table in-order */
+ for (entry = &is_table[0]; entry->is_name; entry++) {
+ if (source < entry->end) {
+ trace_hfi1_interrupt(dd, entry, source);
+ entry->is_int(dd, source - entry->start);
+ return;
+ }
+ }
+ /* fell off the end */
+ dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler. This is able to correctly handle
+ * all interrupts in case INTx is used.
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+ struct hfi1_devdata *dd = data;
+ u64 regs[CCE_NUM_INT_CSRS];
+ u32 bit;
+ int i;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* phase 1: scan and clear all handled interrupts */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+ if (dd->gi_mask[i] == 0) {
+ regs[i] = 0; /* used later */
+ continue;
+ }
+ regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
+ dd->gi_mask[i];
+ /* only clear if anything is set */
+ if (regs[i])
+ write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
+ }
+
+ /* phase 2: call the appropriate handler */
+ for_each_set_bit(bit, (unsigned long *)&regs[0],
+ CCE_NUM_INT_CSRS * 64) {
+ is_interrupt(dd, bit);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+ struct sdma_engine *sde = data;
+ struct hfi1_devdata *dd = sde->dd;
+ u64 status;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ sdma_dumpstate(sde);
+#endif
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* This read_csr is really bad in the hot path */
+ status = read_csr(dd,
+ CCE_INT_STATUS + (8 * (IS_SDMA_START / 64)))
+ & sde->imask;
+ if (likely(status)) {
+ /* clear the interrupt(s) */
+ write_csr(dd,
+ CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)),
+ status);
+
+ /* handle the interrupt(s) */
+ sdma_engine_interrupt(sde, status);
+ } else
+ dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+ sde->this_idx);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Clear the receive interrupt. Use a read of the interrupt clear CSR
+ * to insure that the write completed. This does NOT guarantee that
+ * queued DMA writes to memory from the chip are pushed.
+ */
+static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
+
+ mmiowb(); /* make sure everything before is written */
+ write_csr(dd, addr, rcd->imask);
+ /* force the above write on the chip and get a value back */
+ (void)read_csr(dd, addr);
+}
+
+/* force the receive interrupt */
+void force_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+ write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask);
+}
+
+/*
+ * Return non-zero if a packet is present.
+ *
+ * This routine is called when rechecking for packets after the RcvAvail
+ * interrupt has been cleared down. First, do a quick check of memory for
+ * a packet present. If not found, use an expensive CSR read of the context
+ * tail to determine the actual tail. The CSR read is necessary because there
+ * is no method to push pending DMAs to memory other than an interrupt and we
+ * are trying to determine if we need to force an interrupt.
+ */
+static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
+{
+ u32 tail;
+ int present;
+
+ if (!HFI1_CAP_IS_KSET(DMA_RTAIL))
+ present = (rcd->seq_cnt ==
+ rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))));
+ else /* is RDMA rtail */
+ present = (rcd->head != get_rcvhdrtail(rcd));
+
+ if (present)
+ return 1;
+
+ /* fall back to a CSR read, correct indpendent of DMA_RTAIL */
+ tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+ return rcd->head != tail;
+}
+
+/*
+ * Receive packet IRQ handler. This routine expects to be on its own IRQ.
+ * This routine will try to handle packets immediately (latency), but if
+ * it finds too many, it will invoke the thread handler (bandwitdh). The
+ * chip receive interrupt is *not* cleared down until this or the thread (if
+ * invoked) is finished. The intent is to avoid extra interrupts while we
+ * are processing packets anyway.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+ struct hfi1_ctxtdata *rcd = data;
+ struct hfi1_devdata *dd = rcd->dd;
+ int disposition;
+ int present;
+
+ trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+ this_cpu_inc(*dd->int_counter);
+ aspm_ctx_disable(rcd);
+
+ /* receive interrupt remains blocked while processing packets */
+ disposition = rcd->do_interrupt(rcd, 0);
+
+ /*
+ * Too many packets were seen while processing packets in this
+ * IRQ handler. Invoke the handler thread. The receive interrupt
+ * remains blocked.
+ */
+ if (disposition == RCV_PKT_LIMIT)
+ return IRQ_WAKE_THREAD;
+
+ /*
+ * The packet processor detected no more packets. Clear the receive
+ * interrupt and recheck for a packet packet that may have arrived
+ * after the previous check and interrupt clear. If a packet arrived,
+ * force another interrupt.
+ */
+ clear_recv_intr(rcd);
+ present = check_packet_present(rcd);
+ if (present)
+ force_recv_intr(rcd);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Receive packet thread handler. This expects to be invoked with the
+ * receive interrupt still blocked.
+ */
+static irqreturn_t receive_context_thread(int irq, void *data)
+{
+ struct hfi1_ctxtdata *rcd = data;
+ int present;
+
+ /* receive interrupt is still blocked from the IRQ handler */
+ (void)rcd->do_interrupt(rcd, 1);
+
+ /*
+ * The packet processor will only return if it detected no more
+ * packets. Hold IRQs here so we can safely clear the interrupt and
+ * recheck for a packet that may have arrived after the previous
+ * check and the interrupt clear. If a packet arrived, force another
+ * interrupt.
+ */
+ local_irq_disable();
+ clear_recv_intr(rcd);
+ present = check_packet_present(rcd);
+ if (present)
+ force_recv_intr(rcd);
+ local_irq_enable();
+
+ return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+ return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
+ & DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+u32 read_logical_state(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+ return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
+ & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+ u64 reg;
+
+ reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+ /* clear current state, set new state */
+ reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
+ reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+ write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
+}
+
+/*
+ * Use the 8051 to read a LCB CSR.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+ u32 regno;
+ int ret;
+
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ if (acquire_lcb_access(dd, 0) == 0) {
+ *data = read_csr(dd, addr);
+ release_lcb_access(dd, 0);
+ return 0;
+ }
+ return -EBUSY;
+ }
+
+ /* register is an index of LCB registers: (offset - base) / 8 */
+ regno = (addr - DC_LCB_CFG_RUN) >> 3;
+ ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
+ if (ret != HCMD_SUCCESS)
+ return -EBUSY;
+ return 0;
+}
+
+/*
+ * Read an LCB CSR. Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ /* if up, go through the 8051 for the value */
+ if (ppd->host_link_state & HLS_UP)
+ return read_lcb_via_8051(dd, addr, data);
+ /* if going up or down, no access */
+ if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+ return -EBUSY;
+ /* otherwise, host has access */
+ *data = read_csr(dd, addr);
+ return 0;
+}
+
+/*
+ * Use the 8051 to write a LCB CSR.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+ u32 regno;
+ int ret;
+
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
+ (dd->dc8051_ver < dc8051_ver(0, 20))) {
+ if (acquire_lcb_access(dd, 0) == 0) {
+ write_csr(dd, addr, data);
+ release_lcb_access(dd, 0);
+ return 0;
+ }
+ return -EBUSY;
+ }
+
+ /* register is an index of LCB registers: (offset - base) / 8 */
+ regno = (addr - DC_LCB_CFG_RUN) >> 3;
+ ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data);
+ if (ret != HCMD_SUCCESS)
+ return -EBUSY;
+ return 0;
+}
+
+/*
+ * Write an LCB CSR. Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ /* if up, go through the 8051 for the value */
+ if (ppd->host_link_state & HLS_UP)
+ return write_lcb_via_8051(dd, addr, data);
+ /* if going up or down, no access */
+ if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+ return -EBUSY;
+ /* otherwise, host has access */
+ write_csr(dd, addr, data);
+ return 0;
+}
+
+/*
+ * Returns:
+ * < 0 = Linux error, not able to get access
+ * > 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+ struct hfi1_devdata *dd,
+ u32 type,
+ u64 in_data,
+ u64 *out_data)
+{
+ u64 reg, completed;
+ int return_code;
+ unsigned long flags;
+ unsigned long timeout;
+
+ hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+ /*
+ * Alternative to holding the lock for a long time:
+ * - keep busy wait - have other users bounce off
+ */
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+ /* We can't send any commands to the 8051 if it's in reset */
+ if (dd->dc_shutdown) {
+ return_code = -ENODEV;
+ goto fail;
+ }
+
+ /*
+ * If an 8051 host command timed out previously, then the 8051 is
+ * stuck.
+ *
+ * On first timeout, attempt to reset and restart the entire DC
+ * block (including 8051). (Is this too big of a hammer?)
+ *
+ * If the 8051 times out a second time, the reset did not bring it
+ * back to healthy life. In that case, fail any subsequent commands.
+ */
+ if (dd->dc8051_timed_out) {
+ if (dd->dc8051_timed_out > 1) {
+ dd_dev_err(dd,
+ "Previous 8051 host command timed out, skipping command %u\n",
+ type);
+ return_code = -ENXIO;
+ goto fail;
+ }
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+ dc_shutdown(dd);
+ dc_start(dd);
+ spin_lock_irqsave(&dd->dc8051_lock, flags);
+ }
+
+ /*
+ * If there is no timeout, then the 8051 command interface is
+ * waiting for a command.
+ */
+
+ /*
+ * When writing a LCB CSR, out_data contains the full value to
+ * to be written, while in_data contains the relative LCB
+ * address in 7:0. Do the work here, rather than the caller,
+ * of distrubting the write data to where it needs to go:
+ *
+ * Write data
+ * 39:00 -> in_data[47:8]
+ * 47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE
+ * 63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA
+ */
+ if (type == HCMD_WRITE_LCB_CSR) {
+ in_data |= ((*out_data) & 0xffffffffffull) << 8;
+ reg = ((((*out_data) >> 40) & 0xff) <<
+ DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT)
+ | ((((*out_data) >> 48) & 0xffff) <<
+ DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+ write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg);
+ }
+
+ /*
+ * Do two writes: the first to stabilize the type and req_data, the
+ * second to activate.
+ */
+ reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+ << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+ | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+ << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+ reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+ /* wait for completion, alternate: interrupt */
+ timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+ completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+ if (completed)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd->dc8051_timed_out++;
+ dd_dev_err(dd, "8051 host command %u timeout\n", type);
+ if (out_data)
+ *out_data = 0;
+ return_code = -ETIMEDOUT;
+ goto fail;
+ }
+ udelay(2);
+ }
+
+ if (out_data) {
+ *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+ & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+ if (type == HCMD_READ_LCB_CSR) {
+ /* top 16 bits are in a different register */
+ *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+ & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+ << (48
+ - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+ }
+ }
+ return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+ & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+ dd->dc8051_timed_out = 0;
+ /*
+ * Clear command for next user.
+ */
+ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+ spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+ return return_code;
+}
+
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+ return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+}
+
+int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+ u8 lane_id, u32 config_data)
+{
+ u64 data;
+ int ret;
+
+ data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
+ | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
+ | (u64)config_data << LOAD_DATA_DATA_SHIFT;
+ ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "load 8051 config: field id %d, lane %d, err %d\n",
+ (int)field_id, (int)lane_id, ret);
+ }
+ return ret;
+}
+
+/*
+ * Read the 8051 firmware "registers". Use the RAM directly. Always
+ * set the result, even on error.
+ * Return 0 on success, -errno on failure
+ */
+int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+ u32 *result)
+{
+ u64 big_data;
+ u32 addr;
+ int ret;
+
+ /* address start depends on the lane_id */
+ if (lane_id < 4)
+ addr = (4 * NUM_GENERAL_FIELDS)
+ + (lane_id * 4 * NUM_LANE_FIELDS);
+ else
+ addr = 0;
+ addr += field_id * 4;
+
+ /* read is in 8-byte chunks, hardware will truncate the address down */
+ ret = read_8051_data(dd, addr, 8, &big_data);
+
+ if (ret == 0) {
+ /* extract the 4 bytes we want */
+ if (addr & 0x4)
+ *result = (u32)(big_data >> 32);
+ else
+ *result = (u32)big_data;
+ } else {
+ *result = 0;
+ dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+ __func__, lane_id, field_id);
+ }
+
+ return ret;
+}
+
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+ u8 continuous)
+{
+ u32 frame;
+
+ frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+ | power_management << POWER_MANAGEMENT_SHIFT;
+ return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+ GENERAL_CONFIG, frame);
+}
+
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+ u16 vl15buf, u8 crc_sizes)
+{
+ u32 frame;
+
+ frame = (u32)vau << VAU_SHIFT
+ | (u32)z << Z_SHIFT
+ | (u32)vcu << VCU_SHIFT
+ | (u32)vl15buf << VL15BUF_SHIFT
+ | (u32)crc_sizes << CRC_SIZES_SHIFT;
+ return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+ GENERAL_CONFIG, frame);
+}
+
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+ u8 *flag_bits, u16 *link_widths)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+ &frame);
+ *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+ *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+ *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+ u8 misc_bits,
+ u8 flag_bits,
+ u16 link_widths)
+{
+ u32 frame;
+
+ frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
+ | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
+ | (u32)link_widths << LINK_WIDTH_SHIFT;
+ return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+ frame);
+}
+
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+ u8 device_rev)
+{
+ u32 frame;
+
+ frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
+ | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
+ return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
+}
+
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+ u8 *device_rev)
+{
+ u32 frame;
+
+ read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
+ *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+ *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
+ & REMOTE_DEVICE_REV_MASK;
+}
+
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+ u32 frame;
+
+ read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
+ *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+ *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+ u8 *continuous)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
+ *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
+ & POWER_MANAGEMENT_MASK;
+ *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
+ & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+ u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
+ *vau = (frame >> VAU_SHIFT) & VAU_MASK;
+ *z = (frame >> Z_SHIFT) & Z_MASK;
+ *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
+ *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
+ *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+ u8 *remote_tx_rate,
+ u16 *link_widths)
+{
+ u32 frame;
+
+ read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+ &frame);
+ *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
+ & REMOTE_TX_RATE_MASK;
+ *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+ u32 frame;
+
+ read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
+ *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
+}
+
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+ u32 frame;
+
+ read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
+ *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
+}
+
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+ read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
+}
+
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+ read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
+}
+
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+ u32 frame;
+ int ret;
+
+ *link_quality = 0;
+ if (dd->pport->host_link_state & HLS_UP) {
+ ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+ &frame);
+ if (ret == 0)
+ *link_quality = (frame >> LINK_QUALITY_SHIFT)
+ & LINK_QUALITY_MASK;
+ }
+}
+
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+ u32 frame;
+
+ read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
+ *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
+}
+
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+ u32 frame;
+
+ read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+ *ldr = (frame & 0xff);
+}
+
+static int read_tx_settings(struct hfi1_devdata *dd,
+ u8 *enable_lane_tx,
+ u8 *tx_polarity_inversion,
+ u8 *rx_polarity_inversion,
+ u8 *max_rate)
+{
+ u32 frame;
+ int ret;
+
+ ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
+ *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
+ & ENABLE_LANE_TX_MASK;
+ *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
+ & TX_POLARITY_INVERSION_MASK;
+ *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
+ & RX_POLARITY_INVERSION_MASK;
+ *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+ return ret;
+}
+
+static int write_tx_settings(struct hfi1_devdata *dd,
+ u8 enable_lane_tx,
+ u8 tx_polarity_inversion,
+ u8 rx_polarity_inversion,
+ u8 max_rate)
+{
+ u32 frame;
+
+ /* no need to mask, all variable sizes match field widths */
+ frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
+ | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+ | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+ | max_rate << MAX_RATE_SHIFT;
+ return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+ int ret;
+
+ ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "read idle message: type %d, err %d\n",
+ (u32)type, ret);
+ return -EINVAL;
+ }
+ dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+ /* return only the payload as we already know the type */
+ *data_out >>= IDLE_PAYLOAD_SHIFT;
+ return 0;
+}
+
+/*
+ * Read an idle SMA message. To be done in response to a notification from
+ * the 8051.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+ return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT,
+ data);
+}
+
+/*
+ * Send an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+ int ret;
+
+ dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+ ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+ data, ret);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Send an idle SMA message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+ u64 data;
+
+ data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) |
+ ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+ return send_idle_message(dd, data);
+}
+
+/*
+ * Initialize the LCB then do a quick link up. This may or may not be
+ * in loopback.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ unsigned long timeout;
+ int ret;
+
+ lcb_shutdown(dd, 0);
+
+ if (loopback) {
+ /* LCB_CFG_LOOPBACK.VAL = 2 */
+ /* LCB_CFG_LANE_WIDTH.VAL = 0 */
+ write_csr(dd, DC_LCB_CFG_LOOPBACK,
+ IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+ write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+ }
+
+ /* start the LCBs */
+ /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+ /* simulator only loopback steps */
+ if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ /* LCB_CFG_RUN.EN = 1 */
+ write_csr(dd, DC_LCB_CFG_RUN,
+ 1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+ /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+ timeout = jiffies + msecs_to_jiffies(10);
+ while (1) {
+ reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+ if (reg)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+ return -ETIMEDOUT;
+ }
+ udelay(2);
+ }
+
+ write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+ 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+ }
+
+ if (!loopback) {
+ /*
+ * When doing quick linkup and not in loopback, both
+ * sides must be done with LCB set-up before either
+ * starts the quick linkup. Put a delay here so that
+ * both sides can be started and have a chance to be
+ * done with LCB set up before resuming.
+ */
+ dd_dev_err(dd,
+ "Pausing for peer to be finished with LCB set up\n");
+ msleep(5000);
+ dd_dev_err(dd, "Continuing with quick linkup\n");
+ }
+
+ write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+ set_8051_lcb_access(dd);
+
+ /*
+ * State "quick" LinkUp request sets the physical link state to
+ * LinkUp without a verify capability sequence.
+ * This state is in simulator v37 and later.
+ */
+ ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "%s: set physical link state to quick LinkUp failed with return %d\n",
+ __func__, ret);
+
+ set_host_lcb_access(dd);
+ write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+ if (ret >= 0)
+ ret = -EINVAL;
+ return ret;
+ }
+
+ return 0; /* success */
+}
+
+/*
+ * Set the SerDes to internal loopback mode.
+ * Returns 0 on success, -errno on error.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+ if (ret == HCMD_SUCCESS)
+ return 0;
+ dd_dev_err(dd,
+ "Set physical link state to SerDes Loopback failed with return %d\n",
+ ret);
+ if (ret >= 0)
+ ret = -EINVAL;
+ return ret;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+ dd_dev_info(dd, "Entering loopback mode\n");
+
+ /* all loopbacks should disable self GUID check */
+ write_csr(dd, DC_DC8051_CFG_MODE,
+ (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+ /*
+ * The simulator has only one loopback option - LCB. Switch
+ * to that option, which includes quick link up.
+ *
+ * Accept all valid loopback values.
+ */
+ if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) &&
+ (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+ loopback == LOOPBACK_CABLE)) {
+ loopback = LOOPBACK_LCB;
+ quick_linkup = 1;
+ return 0;
+ }
+
+ /* handle serdes loopback */
+ if (loopback == LOOPBACK_SERDES) {
+ /* internal serdes loopack needs quick linkup on RTL */
+ if (dd->icode == ICODE_RTL_SILICON)
+ quick_linkup = 1;
+ return set_serdes_loopback_mode(dd);
+ }
+
+ /* LCB loopback - handled at poll time */
+ if (loopback == LOOPBACK_LCB) {
+ quick_linkup = 1; /* LCB is always quick linkup */
+
+ /* not supported in emulation due to emulation RTL changes */
+ if (dd->icode == ICODE_FPGA_EMULATION) {
+ dd_dev_err(dd,
+ "LCB loopback not supported in emulation\n");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ /* external cable loopback requires no extra steps */
+ if (loopback == LOOPBACK_CABLE)
+ return 0;
+
+ dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+ return -EINVAL;
+}
+
+/*
+ * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
+ * used in the Verify Capability link width attribute.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+ int i;
+ u16 result = 0;
+
+ static const struct link_bits {
+ u16 from;
+ u16 to;
+ } opa_link_xlate[] = {
+ { OPA_LINK_WIDTH_1X, 1 << (1 - 1) },
+ { OPA_LINK_WIDTH_2X, 1 << (2 - 1) },
+ { OPA_LINK_WIDTH_3X, 1 << (3 - 1) },
+ { OPA_LINK_WIDTH_4X, 1 << (4 - 1) },
+ };
+
+ for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
+ if (opa_widths & opa_link_xlate[i].from)
+ result |= opa_link_xlate[i].to;
+ }
+ return result;
+}
+
+/*
+ * Set link attributes before moving to polling.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u8 enable_lane_tx;
+ u8 tx_polarity_inversion;
+ u8 rx_polarity_inversion;
+ int ret;
+
+ /* reset our fabric serdes to clear any lingering problems */
+ fabric_serdes_reset(dd);
+
+ /* set the local tx rate - need to read-modify-write */
+ ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+ &rx_polarity_inversion, &ppd->local_tx_rate);
+ if (ret)
+ goto set_local_link_attributes_fail;
+
+ if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+ /* set the tx rate to the fastest enabled */
+ if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+ ppd->local_tx_rate = 1;
+ else
+ ppd->local_tx_rate = 0;
+ } else {
+ /* set the tx rate to all enabled */
+ ppd->local_tx_rate = 0;
+ if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+ ppd->local_tx_rate |= 2;
+ if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+ ppd->local_tx_rate |= 1;
+ }
+
+ enable_lane_tx = 0xF; /* enable all four lanes */
+ ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+ rx_polarity_inversion, ppd->local_tx_rate);
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ /*
+ * DC supports continuous updates.
+ */
+ ret = write_vc_local_phy(dd,
+ 0 /* no power management */,
+ 1 /* continuous updates */);
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ /* z=1 in the next call: AU of 0 is not supported by the hardware */
+ ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+ ppd->port_crc_mode_enabled);
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ ret = write_vc_local_link_width(dd, 0, 0,
+ opa_to_vc_link_widths(
+ ppd->link_width_enabled));
+ if (ret != HCMD_SUCCESS)
+ goto set_local_link_attributes_fail;
+
+ /* let peer know who we are */
+ ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+ if (ret == HCMD_SUCCESS)
+ return 0;
+
+set_local_link_attributes_fail:
+ dd_dev_err(dd,
+ "Failed to set local link attributes, return 0x%x\n",
+ ret);
+ return ret;
+}
+
+/*
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+ if (!ppd->link_enabled) {
+ dd_dev_info(ppd->dd,
+ "%s: stopping link start because link is disabled\n",
+ __func__);
+ return 0;
+ }
+ if (!ppd->driver_link_ready) {
+ dd_dev_info(ppd->dd,
+ "%s: stopping link start because driver is not ready\n",
+ __func__);
+ return 0;
+ }
+
+ /*
+ * FULL_MGMT_P_KEY is cleared from the pkey table, so that the
+ * pkey table can be configured properly if the HFI unit is connected
+ * to switch port with MgmtAllowed=NO
+ */
+ clear_full_mgmt_pkey(ppd);
+
+ return set_link_state(ppd, HLS_DN_POLL);
+}
+
+static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 mask;
+ unsigned long timeout;
+
+ /*
+ * Some QSFP cables have a quirk that asserts the IntN line as a side
+ * effect of power up on plug-in. We ignore this false positive
+ * interrupt until the module has finished powering up by waiting for
+ * a minimum timeout of the module inrush initialization time of
+ * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the
+ * module have stabilized.
+ */
+ msleep(500);
+
+ /*
+ * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1)
+ */
+ timeout = jiffies + msecs_to_jiffies(2000);
+ while (1) {
+ mask = read_csr(dd, dd->hfi1_id ?
+ ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+ if (!(mask & QSFP_HFI0_INT_N))
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
+ __func__);
+ break;
+ }
+ udelay(2);
+ }
+}
+
+static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 mask;
+
+ mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
+ if (enable) {
+ /*
+ * Clear the status register to avoid an immediate interrupt
+ * when we re-enable the IntN pin
+ */
+ write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+ QSFP_HFI0_INT_N);
+ mask |= (u64)QSFP_HFI0_INT_N;
+ } else {
+ mask &= ~(u64)QSFP_HFI0_INT_N;
+ }
+ write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask);
+}
+
+void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 mask, qsfp_mask;
+
+ /* Disable INT_N from triggering QSFP interrupts */
+ set_qsfp_int_n(ppd, 0);
+
+ /* Reset the QSFP */
+ mask = (u64)QSFP_HFI0_RESET_N;
+
+ qsfp_mask = read_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+ qsfp_mask &= ~mask;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+
+ udelay(10);
+
+ qsfp_mask |= mask;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask);
+
+ wait_for_qsfp_init(ppd);
+
+ /*
+ * Allow INT_N to trigger the QSFP interrupt to watch
+ * for alarms and warnings
+ */
+ set_qsfp_int_n(ppd, 1);
+}
+
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+ u8 *qsfp_interrupt_status)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
+ (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+ dd_dev_info(dd, "%s: QSFP cable on fire\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
+ (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+ dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
+ __func__);
+
+ /*
+ * The remaining alarms/warnings don't matter if the link is down.
+ */
+ if (ppd->host_link_state & HLS_DOWN)
+ return 0;
+
+ if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
+ (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+ dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
+ (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+ dd_dev_info(dd, "%s: QSFP supply voltage too low\n",
+ __func__);
+
+ /* Byte 2 is vendor specific */
+
+ if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
+ (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
+ (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
+ (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
+ (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
+ (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n",
+ __func__);
+
+ if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
+ (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+ dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n",
+ __func__);
+
+ /* Bytes 9-10 and 11-12 are reserved */
+ /* Bytes 13-15 are vendor specific */
+
+ return 0;
+}
+
+/* This routine will only be scheduled if the QSFP module present is asserted */
+void qsfp_event(struct work_struct *work)
+{
+ struct qsfp_data *qd;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+
+ qd = container_of(work, struct qsfp_data, qsfp_work);
+ ppd = qd->ppd;
+ dd = ppd->dd;
+
+ /* Sanity check */
+ if (!qsfp_mod_present(ppd))
+ return;
+
+ /*
+ * Turn DC back on after cable has been re-inserted. Up until
+ * now, the DC has been in reset to save power.
+ */
+ dc_start(dd);
+
+ if (qd->cache_refresh_required) {
+ set_qsfp_int_n(ppd, 0);
+
+ wait_for_qsfp_init(ppd);
+
+ /*
+ * Allow INT_N to trigger the QSFP interrupt to watch
+ * for alarms and warnings
+ */
+ set_qsfp_int_n(ppd, 1);
+
+ tune_serdes(ppd);
+
+ start_link(ppd);
+ }
+
+ if (qd->check_interrupt_flags) {
+ u8 qsfp_interrupt_status[16] = {0,};
+
+ if (one_qsfp_read(ppd, dd->hfi1_id, 6,
+ &qsfp_interrupt_status[0], 16) != 16) {
+ dd_dev_info(dd,
+ "%s: Failed to read status of QSFP module\n",
+ __func__);
+ } else {
+ unsigned long flags;
+
+ handle_qsfp_error_conditions(
+ ppd, qsfp_interrupt_status);
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.check_interrupt_flags = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+ flags);
+ }
+ }
+}
+
+static void init_qsfp_int(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd = dd->pport;
+ u64 qsfp_mask, cce_int_mask;
+ const int qsfp1_int_smask = QSFP1_INT % 64;
+ const int qsfp2_int_smask = QSFP2_INT % 64;
+
+ /*
+ * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+ * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+ * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+ * the index of the appropriate CSR in the CCEIntMask CSR array
+ */
+ cce_int_mask = read_csr(dd, CCE_INT_MASK +
+ (8 * (QSFP1_INT / 64)));
+ if (dd->hfi1_id) {
+ cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+ write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
+ cce_int_mask);
+ } else {
+ cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+ write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
+ cce_int_mask);
+ }
+
+ qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+ /* Clear current status to avoid spurious interrupts */
+ write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+ qsfp_mask);
+ write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+ qsfp_mask);
+
+ set_qsfp_int_n(ppd, 0);
+
+ /* Handle active low nature of INT_N and MODPRST_N pins */
+ if (qsfp_mod_present(ppd))
+ qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+ write_csr(dd,
+ dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+ qsfp_mask);
+}
+
+/*
+ * Do a one-time initialize of the LCB block.
+ */
+static void init_lcb(struct hfi1_devdata *dd)
+{
+ /* simulator does not correctly handle LCB cclk loopback, skip */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ return;
+
+ /* the DC has been reset earlier in the driver load */
+
+ /* set LCB for cclk loopback on the port */
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01);
+ write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00);
+ write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00);
+ write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110);
+ write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08);
+ write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02);
+ write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00);
+}
+
+/*
+ * Perform a test read on the QSFP. Return 0 on success, -ERRNO
+ * on error.
+ */
+static int test_qsfp_read(struct hfi1_pportdata *ppd)
+{
+ int ret;
+ u8 status;
+
+ /* report success if not a QSFP */
+ if (ppd->port_type != PORT_TYPE_QSFP)
+ return 0;
+
+ /* read byte 2, the status byte */
+ ret = one_qsfp_read(ppd, ppd->dd->hfi1_id, 2, &status, 1);
+ if (ret < 0)
+ return ret;
+ if (ret != 1)
+ return -EIO;
+
+ return 0; /* success */
+}
+
+/*
+ * Values for QSFP retry.
+ *
+ * Give up after 10s (20 x 500ms). The overall timeout was empirically
+ * arrived at from experience on a large cluster.
+ */
+#define MAX_QSFP_RETRIES 20
+#define QSFP_RETRY_WAIT 500 /* msec */
+
+/*
+ * Try a QSFP read. If it fails, schedule a retry for later.
+ * Called on first link activation after driver load.
+ */
+static void try_start_link(struct hfi1_pportdata *ppd)
+{
+ if (test_qsfp_read(ppd)) {
+ /* read failed */
+ if (ppd->qsfp_retry_count >= MAX_QSFP_RETRIES) {
+ dd_dev_err(ppd->dd, "QSFP not responding, giving up\n");
+ return;
+ }
+ dd_dev_info(ppd->dd,
+ "QSFP not responding, waiting and retrying %d\n",
+ (int)ppd->qsfp_retry_count);
+ ppd->qsfp_retry_count++;
+ queue_delayed_work(ppd->hfi1_wq, &ppd->start_link_work,
+ msecs_to_jiffies(QSFP_RETRY_WAIT));
+ return;
+ }
+ ppd->qsfp_retry_count = 0;
+
+ /*
+ * Tune the SerDes to a ballpark setting for optimal signal and bit
+ * error rate. Needs to be done before starting the link.
+ */
+ tune_serdes(ppd);
+ start_link(ppd);
+}
+
+/*
+ * Workqueue function to start the link after a delay.
+ */
+void handle_start_link(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ start_link_work.work);
+ try_start_link(ppd);
+}
+
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 guid;
+ int ret;
+
+ if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+ add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
+ guid = ppd->guid;
+ if (!guid) {
+ if (dd->base_guid)
+ guid = dd->base_guid + ppd->port - 1;
+ ppd->guid = guid;
+ }
+
+ /* Set linkinit_reason on power up per OPA spec */
+ ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+ /* one-time init of the LCB */
+ init_lcb(dd);
+
+ if (loopback) {
+ ret = init_loopback(dd);
+ if (ret < 0)
+ return ret;
+ }
+
+ get_port_type(ppd);
+ if (ppd->port_type == PORT_TYPE_QSFP) {
+ set_qsfp_int_n(ppd, 0);
+ wait_for_qsfp_init(ppd);
+ set_qsfp_int_n(ppd, 1);
+ }
+
+ try_start_link(ppd);
+ return 0;
+}
+
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /*
+ * Shut down the link and keep it down. First turn off that the
+ * driver wants to allow the link to be up (driver_link_ready).
+ * Then make sure the link is not automatically restarted
+ * (link_enabled). Cancel any pending restart. And finally
+ * go offline.
+ */
+ ppd->driver_link_ready = 0;
+ ppd->link_enabled = 0;
+
+ ppd->qsfp_retry_count = MAX_QSFP_RETRIES; /* prevent more retries */
+ flush_delayed_work(&ppd->start_link_work);
+ cancel_delayed_work_sync(&ppd->start_link_work);
+
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+ OPA_LINKDOWN_REASON_SMA_DISABLED);
+ set_link_state(ppd, HLS_DN_OFFLINE);
+
+ /* disable the port */
+ clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ ppd->ibport_data.rvp.rc_acks = NULL;
+ ppd->ibport_data.rvp.rc_qacks = NULL;
+ ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64);
+ ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64);
+ ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64);
+ if (!ppd->ibport_data.rvp.rc_acks ||
+ !ppd->ibport_data.rvp.rc_delayed_comp ||
+ !ppd->ibport_data.rvp.rc_qacks)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static const char * const pt_names[] = {
+ "expected",
+ "eager",
+ "invalid"
+};
+
+static const char *pt_name(u32 type)
+{
+ return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
+}
+
+/*
+ * index is the index into the receive array
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+ u32 type, unsigned long pa, u16 order)
+{
+ u64 reg;
+ void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
+ (dd->kregbase + RCV_ARRAY));
+
+ if (!(dd->flags & HFI1_PRESENT))
+ goto done;
+
+ if (type == PT_INVALID) {
+ pa = 0;
+ } else if (type > PT_INVALID) {
+ dd_dev_err(dd,
+ "unexpected receive array type %u for index %u, not handled\n",
+ type, index);
+ goto done;
+ }
+
+ hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+ pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */
+ reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
+ | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
+ | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+ << RCV_ARRAY_RT_ADDR_SHIFT;
+ writeq(reg, base + (index * 8));
+
+ if (type == PT_EAGER)
+ /*
+ * Eager entries are written one-by-one so we have to push them
+ * after we write the entry.
+ */
+ flush_wc();
+done:
+ return;
+}
+
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 i;
+
+ /* this could be optimized */
+ for (i = rcd->eager_base; i < rcd->eager_base +
+ rcd->egrbufs.alloced; i++)
+ hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+
+ for (i = rcd->expected_base;
+ i < rcd->expected_base + rcd->expected_count; i++)
+ hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+}
+
+struct hfi1_message_header *hfi1_get_msgheader(
+ struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+ u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+ return (struct hfi1_message_header *)
+ (rhf_addr - dd->rhf_offset + offset);
+}
+
+static const char * const ib_cfg_name_strings[] = {
+ "HFI1_IB_CFG_LIDLMC",
+ "HFI1_IB_CFG_LWID_DG_ENB",
+ "HFI1_IB_CFG_LWID_ENB",
+ "HFI1_IB_CFG_LWID",
+ "HFI1_IB_CFG_SPD_ENB",
+ "HFI1_IB_CFG_SPD",
+ "HFI1_IB_CFG_RXPOL_ENB",
+ "HFI1_IB_CFG_LREV_ENB",
+ "HFI1_IB_CFG_LINKLATENCY",
+ "HFI1_IB_CFG_HRTBT",
+ "HFI1_IB_CFG_OP_VLS",
+ "HFI1_IB_CFG_VL_HIGH_CAP",
+ "HFI1_IB_CFG_VL_LOW_CAP",
+ "HFI1_IB_CFG_OVERRUN_THRESH",
+ "HFI1_IB_CFG_PHYERR_THRESH",
+ "HFI1_IB_CFG_LINKDEFAULT",
+ "HFI1_IB_CFG_PKEYS",
+ "HFI1_IB_CFG_MTU",
+ "HFI1_IB_CFG_LSTATE",
+ "HFI1_IB_CFG_VL_HIGH_LIMIT",
+ "HFI1_IB_CFG_PMA_TICKS",
+ "HFI1_IB_CFG_PORT"
+};
+
+static const char *ib_cfg_name(int which)
+{
+ if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
+ return "invalid";
+ return ib_cfg_name_strings[which];
+}
+
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int val = 0;
+
+ switch (which) {
+ case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+ val = ppd->link_width_enabled;
+ break;
+ case HFI1_IB_CFG_LWID: /* currently active Link-width */
+ val = ppd->link_width_active;
+ break;
+ case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+ val = ppd->link_speed_enabled;
+ break;
+ case HFI1_IB_CFG_SPD: /* current Link speed */
+ val = ppd->link_speed_active;
+ break;
+
+ case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+ case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+ case HFI1_IB_CFG_LINKLATENCY:
+ goto unimplemented;
+
+ case HFI1_IB_CFG_OP_VLS:
+ val = ppd->vls_operational;
+ break;
+ case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+ val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
+ break;
+ case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+ val = VL_ARB_LOW_PRIO_TABLE_SIZE;
+ break;
+ case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ val = ppd->overrun_threshold;
+ break;
+ case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ val = ppd->phy_error_threshold;
+ break;
+ case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ val = dd->link_default;
+ break;
+
+ case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+ case HFI1_IB_CFG_PMA_TICKS:
+ default:
+unimplemented:
+ if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+ dd_dev_info(
+ dd,
+ "%s: which %s: not implemented\n",
+ __func__,
+ ib_cfg_name(which));
+ break;
+ }
+
+ return val;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * Return the maximum header bytes that can go on the _wire_
+ * for this device. This count includes the ICRC which is
+ * not part of the packet held in memory but it is appended
+ * by the HW.
+ * This is dependent on the device's receive header entry size.
+ * HFI allows this to be set per-receive context, but the
+ * driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+ /*
+ * The maximum non-payload (MTU) bytes in LRH.PktLen are
+ * the Receive Header Entry Size minus the PBC (or RHF) size
+ * plus one DW for the ICRC appended by HW.
+ *
+ * dd->rcd[0].rcvhdrqentsize is in DW.
+ * We use rcd[0] as all context will have the same value. Also,
+ * the first kernel context would have been allocated by now so
+ * we are guaranteed a valid value.
+ */
+ return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+}
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent. The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 max_hb = lrh_max_header_bytes(dd), dcmtu;
+ u32 maxvlmtu = dd->vld[15].mtu;
+ u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+ & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+ SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+ int i, j;
+ u32 thres;
+
+ for (i = 0; i < ppd->vls_supported; i++) {
+ if (dd->vld[i].mtu > maxvlmtu)
+ maxvlmtu = dd->vld[i].mtu;
+ if (i <= 3)
+ len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+ & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+ ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+ else
+ len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+ & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+ ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+ }
+ write_csr(dd, SEND_LEN_CHECK0, len1);
+ write_csr(dd, SEND_LEN_CHECK1, len2);
+ /* adjust kernel credit return thresholds based on new MTUs */
+ /* all kernel receive contexts have the same hdrqentsize */
+ for (i = 0; i < ppd->vls_supported; i++) {
+ thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+ sc_mtu_to_threshold(dd->vld[i].sc,
+ dd->vld[i].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+ for (j = 0; j < INIT_SC_PER_VL; j++)
+ sc_set_cr_threshold(
+ pio_select_send_context_vl(dd, j, i),
+ thres);
+ }
+ thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+ sc_mtu_to_threshold(dd->vld[15].sc,
+ dd->vld[15].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+ sc_set_cr_threshold(dd->vld[15].sc, thres);
+
+ /* Adjust maximum MTU for the port in DC */
+ dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+ (ilog2(maxvlmtu >> 8) + 1);
+ len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+ len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+ len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+ DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+ write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
+
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+ int i;
+ u64 sreg = 0;
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 mask = ~((1U << ppd->lmc) - 1);
+ u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
+
+ if (dd->hfi1_snoop.mode_flag)
+ dd_dev_info(dd, "Set lid/lmc while snooping");
+
+ c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+ | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+ c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+ << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) |
+ ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+ << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+ write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
+
+ /*
+ * Iterate over all the send contexts and set their SLID check
+ */
+ sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+ SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+ (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+ SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+ for (i = 0; i < dd->chip_send_contexts; i++) {
+ hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+ i, (u32)sreg);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+ }
+
+ /* Now we have to do the same thing for the sdma engines */
+ sdma_update_lmc(dd, mask, ppd->lid);
+}
+
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+ unsigned long timeout;
+ u32 curr_state;
+
+ timeout = jiffies + msecs_to_jiffies(msecs);
+ while (1) {
+ curr_state = read_physical_state(dd);
+ if (curr_state == state)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+ state, curr_state);
+ return -ETIMEDOUT;
+ }
+ usleep_range(1950, 2050); /* sleep 2ms-ish */
+ }
+
+ return 0;
+}
+
+static const char *state_completed_string(u32 completed)
+{
+ static const char * const state_completed[] = {
+ "EstablishComm",
+ "OptimizeEQ",
+ "VerifyCap"
+ };
+
+ if (completed < ARRAY_SIZE(state_completed))
+ return state_completed[completed];
+
+ return "unknown";
+}
+
+static const char all_lanes_dead_timeout_expired[] =
+ "All lanes were inactive – was the interconnect media removed?";
+static const char tx_out_of_policy[] =
+ "Passing lanes on local port do not meet the local link width policy";
+static const char no_state_complete[] =
+ "State timeout occurred before link partner completed the state";
+static const char * const state_complete_reasons[] = {
+ [0x00] = "Reason unknown",
+ [0x01] = "Link was halted by driver, refer to LinkDownReason",
+ [0x02] = "Link partner reported failure",
+ [0x10] = "Unable to achieve frame sync on any lane",
+ [0x11] =
+ "Unable to find a common bit rate with the link partner",
+ [0x12] =
+ "Unable to achieve frame sync on sufficient lanes to meet the local link width policy",
+ [0x13] =
+ "Unable to identify preset equalization on sufficient lanes to meet the local link width policy",
+ [0x14] = no_state_complete,
+ [0x15] =
+ "State timeout occurred before link partner identified equalization presets",
+ [0x16] =
+ "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy",
+ [0x17] = tx_out_of_policy,
+ [0x20] = all_lanes_dead_timeout_expired,
+ [0x21] =
+ "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy",
+ [0x22] = no_state_complete,
+ [0x23] =
+ "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy",
+ [0x24] = tx_out_of_policy,
+ [0x30] = all_lanes_dead_timeout_expired,
+ [0x31] =
+ "State timeout occurred waiting for host to process received frames",
+ [0x32] = no_state_complete,
+ [0x33] =
+ "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy",
+ [0x34] = tx_out_of_policy,
+};
+
+static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd,
+ u32 code)
+{
+ const char *str = NULL;
+
+ if (code < ARRAY_SIZE(state_complete_reasons))
+ str = state_complete_reasons[code];
+
+ if (str)
+ return str;
+ return "Reserved";
+}
+
+/* describe the given last state complete frame */
+static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame,
+ const char *prefix)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 success;
+ u32 state;
+ u32 reason;
+ u32 lanes;
+
+ /*
+ * Decode frame:
+ * [ 0: 0] - success
+ * [ 3: 1] - state
+ * [ 7: 4] - next state timeout
+ * [15: 8] - reason code
+ * [31:16] - lanes
+ */
+ success = frame & 0x1;
+ state = (frame >> 1) & 0x7;
+ reason = (frame >> 8) & 0xff;
+ lanes = (frame >> 16) & 0xffff;
+
+ dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n",
+ prefix, frame);
+ dd_dev_err(dd, " last reported state state: %s (0x%x)\n",
+ state_completed_string(state), state);
+ dd_dev_err(dd, " state successfully completed: %s\n",
+ success ? "yes" : "no");
+ dd_dev_err(dd, " fail reason 0x%x: %s\n",
+ reason, state_complete_reason_code_string(ppd, reason));
+ dd_dev_err(dd, " passing lane mask: 0x%x", lanes);
+}
+
+/*
+ * Read the last state complete frames and explain them. This routine
+ * expects to be called if the link went down during link negotiation
+ * and initialization (LNI). That is, anywhere between polling and link up.
+ */
+static void check_lni_states(struct hfi1_pportdata *ppd)
+{
+ u32 last_local_state;
+ u32 last_remote_state;
+
+ read_last_local_state(ppd->dd, &last_local_state);
+ read_last_remote_state(ppd->dd, &last_remote_state);
+
+ /*
+ * Don't report anything if there is nothing to report. A value of
+ * 0 means the link was taken down while polling and there was no
+ * training in-process.
+ */
+ if (last_local_state == 0 && last_remote_state == 0)
+ return;
+
+ decode_state_complete(ppd, last_local_state, "transmitted");
+ decode_state_complete(ppd, last_remote_state, "received");
+}
+
+/*
+ * Helper for set_link_state(). Do not call except from that routine.
+ * Expects ppd->hls_mutex to be held.
+ *
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 pstate, previous_state;
+ int ret;
+ int do_transition;
+ int do_wait;
+
+ previous_state = ppd->host_link_state;
+ ppd->host_link_state = HLS_GOING_OFFLINE;
+ pstate = read_physical_state(dd);
+ if (pstate == PLS_OFFLINE) {
+ do_transition = 0; /* in right state */
+ do_wait = 0; /* ...no need to wait */
+ } else if ((pstate & 0xff) == PLS_OFFLINE) {
+ do_transition = 0; /* in an offline transient state */
+ do_wait = 1; /* ...wait for it to settle */
+ } else {
+ do_transition = 1; /* need to move to offline */
+ do_wait = 1; /* ...will need to wait */
+ }
+
+ if (do_transition) {
+ ret = set_physical_link_state(dd,
+ (rem_reason << 8) | PLS_OFFLINE);
+
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to Offline link state, return %d\n",
+ ret);
+ return -EINVAL;
+ }
+ if (ppd->offline_disabled_reason ==
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT);
+ }
+
+ if (do_wait) {
+ /* it can take a while for the link to go down */
+ ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
+ if (ret < 0)
+ return ret;
+ }
+
+ /* make sure the logical state is also down */
+ wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+ /*
+ * Now in charge of LCB - must be after the physical state is
+ * offline.quiet and before host_link_state is changed.
+ */
+ set_host_lcb_access(dd);
+ write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+ ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+ if (ppd->port_type == PORT_TYPE_QSFP &&
+ ppd->qsfp_info.limiting_active &&
+ qsfp_mod_present(ppd)) {
+ int ret;
+
+ ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT);
+ if (ret == 0) {
+ set_qsfp_tx(ppd, 0);
+ release_chip_resource(dd, qsfp_resource(dd));
+ } else {
+ /* not fatal, but should warn */
+ dd_dev_err(dd,
+ "Unable to acquire lock to turn off QSFP TX\n");
+ }
+ }
+
+ /*
+ * The LNI has a mandatory wait time after the physical state
+ * moves to Offline.Quiet. The wait time may be different
+ * depending on how the link went down. The 8051 firmware
+ * will observe the needed wait time and only move to ready
+ * when that is completed. The largest of the quiet timeouts
+ * is 6s, so wait that long and then at least 0.5s more for
+ * other transitions, and another 0.5s for a buffer.
+ */
+ ret = wait_fm_ready(dd, 7000);
+ if (ret) {
+ dd_dev_err(dd,
+ "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+ /* state is really offline, so make it so */
+ ppd->host_link_state = HLS_DN_OFFLINE;
+ return ret;
+ }
+
+ /*
+ * The state is now offline and the 8051 is ready to accept host
+ * requests.
+ * - change our state
+ * - notify others if we were previously in a linkup state
+ */
+ ppd->host_link_state = HLS_DN_OFFLINE;
+ if (previous_state & HLS_UP) {
+ /* went down while link was up */
+ handle_linkup_change(dd, 0);
+ } else if (previous_state
+ & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+ /* went down while attempting link up */
+ check_lni_states(ppd);
+ }
+
+ /* the active link width (downgrade) is 0 on link down */
+ ppd->link_width_active = 0;
+ ppd->link_width_downgrade_tx_active = 0;
+ ppd->link_width_downgrade_rx_active = 0;
+ ppd->current_egress_rate = 0;
+ return 0;
+}
+
+/* return the link state name */
+static const char *link_state_name(u32 state)
+{
+ const char *name;
+ int n = ilog2(state);
+ static const char * const names[] = {
+ [__HLS_UP_INIT_BP] = "INIT",
+ [__HLS_UP_ARMED_BP] = "ARMED",
+ [__HLS_UP_ACTIVE_BP] = "ACTIVE",
+ [__HLS_DN_DOWNDEF_BP] = "DOWNDEF",
+ [__HLS_DN_POLL_BP] = "POLL",
+ [__HLS_DN_DISABLE_BP] = "DISABLE",
+ [__HLS_DN_OFFLINE_BP] = "OFFLINE",
+ [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP",
+ [__HLS_GOING_UP_BP] = "GOING_UP",
+ [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+ [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+ };
+
+ name = n < ARRAY_SIZE(names) ? names[n] : NULL;
+ return name ? name : "unknown";
+}
+
+/* return the link state reason name */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+ if (state == HLS_UP_INIT) {
+ switch (ppd->linkinit_reason) {
+ case OPA_LINKINIT_REASON_LINKUP:
+ return "(LINKUP)";
+ case OPA_LINKINIT_REASON_FLAPPING:
+ return "(FLAPPING)";
+ case OPA_LINKINIT_OUTSIDE_POLICY:
+ return "(OUTSIDE_POLICY)";
+ case OPA_LINKINIT_QUARANTINED:
+ return "(QUARANTINED)";
+ case OPA_LINKINIT_INSUFIC_CAPABILITY:
+ return "(INSUFIC_CAPABILITY)";
+ default:
+ break;
+ }
+ }
+ return "";
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+ switch (ppd->host_link_state) {
+ case HLS_UP_INIT:
+ case HLS_UP_ARMED:
+ case HLS_UP_ACTIVE:
+ return IB_PORTPHYSSTATE_LINKUP;
+ case HLS_DN_POLL:
+ return IB_PORTPHYSSTATE_POLLING;
+ case HLS_DN_DISABLE:
+ return IB_PORTPHYSSTATE_DISABLED;
+ case HLS_DN_OFFLINE:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case HLS_VERIFY_CAP:
+ return IB_PORTPHYSSTATE_POLLING;
+ case HLS_GOING_UP:
+ return IB_PORTPHYSSTATE_POLLING;
+ case HLS_GOING_OFFLINE:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case HLS_LINK_COOLDOWN:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case HLS_DN_DOWNDEF:
+ default:
+ dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+ ppd->host_link_state);
+ return -1;
+ }
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+ if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN))
+ return IB_PORT_DOWN;
+
+ switch (ppd->host_link_state & HLS_UP) {
+ case HLS_UP_INIT:
+ return IB_PORT_INIT;
+ case HLS_UP_ARMED:
+ return IB_PORT_ARMED;
+ case HLS_UP_ACTIVE:
+ return IB_PORT_ACTIVE;
+ default:
+ dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+ ppd->host_link_state);
+ return -1;
+ }
+}
+
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+ u8 neigh_reason, u8 rem_reason)
+{
+ if (ppd->local_link_down_reason.latest == 0 &&
+ ppd->neigh_link_down_reason.latest == 0) {
+ ppd->local_link_down_reason.latest = lcl_reason;
+ ppd->neigh_link_down_reason.latest = neigh_reason;
+ ppd->remote_link_down_reason = rem_reason;
+ }
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt. It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct ib_event event = {.device = NULL};
+ int ret1, ret = 0;
+ int orig_new_state, poll_bounce;
+
+ mutex_lock(&ppd->hls_lock);
+
+ orig_new_state = state;
+ if (state == HLS_DN_DOWNDEF)
+ state = dd->link_default;
+
+ /* interpret poll -> poll as a link bounce */
+ poll_bounce = ppd->host_link_state == HLS_DN_POLL &&
+ state == HLS_DN_POLL;
+
+ dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+ link_state_name(ppd->host_link_state),
+ link_state_name(orig_new_state),
+ poll_bounce ? "(bounce) " : "",
+ link_state_reason_name(ppd, state));
+
+ /*
+ * If we're going to a (HLS_*) link state that implies the logical
+ * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+ * reset is_sm_config_started to 0.
+ */
+ if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+ ppd->is_sm_config_started = 0;
+
+ /*
+ * Do nothing if the states match. Let a poll to poll link bounce
+ * go through.
+ */
+ if (ppd->host_link_state == state && !poll_bounce)
+ goto done;
+
+ switch (state) {
+ case HLS_UP_INIT:
+ if (ppd->host_link_state == HLS_DN_POLL &&
+ (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+ /*
+ * Quick link up jumps from polling to here.
+ *
+ * Whether in normal or loopback mode, the
+ * simulator jumps from polling to link up.
+ * Accept that here.
+ */
+ /* OK */
+ } else if (ppd->host_link_state != HLS_GOING_UP) {
+ goto unexpected;
+ }
+
+ ppd->host_link_state = HLS_UP_INIT;
+ ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+ if (ret) {
+ /* logical state didn't change, stay at going_up */
+ ppd->host_link_state = HLS_GOING_UP;
+ dd_dev_err(dd,
+ "%s: logical state did not change to INIT\n",
+ __func__);
+ } else {
+ /* clear old transient LINKINIT_REASON code */
+ if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+ ppd->linkinit_reason =
+ OPA_LINKINIT_REASON_LINKUP;
+
+ /* enable the port */
+ add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+ handle_linkup_change(dd, 1);
+ }
+ break;
+ case HLS_UP_ARMED:
+ if (ppd->host_link_state != HLS_UP_INIT)
+ goto unexpected;
+
+ ppd->host_link_state = HLS_UP_ARMED;
+ set_logical_state(dd, LSTATE_ARMED);
+ ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+ if (ret) {
+ /* logical state didn't change, stay at init */
+ ppd->host_link_state = HLS_UP_INIT;
+ dd_dev_err(dd,
+ "%s: logical state did not change to ARMED\n",
+ __func__);
+ }
+ /*
+ * The simulator does not currently implement SMA messages,
+ * so neighbor_normal is not set. Set it here when we first
+ * move to Armed.
+ */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ ppd->neighbor_normal = 1;
+ break;
+ case HLS_UP_ACTIVE:
+ if (ppd->host_link_state != HLS_UP_ARMED)
+ goto unexpected;
+
+ ppd->host_link_state = HLS_UP_ACTIVE;
+ set_logical_state(dd, LSTATE_ACTIVE);
+ ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+ if (ret) {
+ /* logical state didn't change, stay at armed */
+ ppd->host_link_state = HLS_UP_ARMED;
+ dd_dev_err(dd,
+ "%s: logical state did not change to ACTIVE\n",
+ __func__);
+ } else {
+ /* tell all engines to go running */
+ sdma_all_running(dd);
+
+ /* Signal the IB layer that the port has went active */
+ event.device = &dd->verbs_dev.rdi.ibdev;
+ event.element.port_num = ppd->port;
+ event.event = IB_EVENT_PORT_ACTIVE;
+ }
+ break;
+ case HLS_DN_POLL:
+ if ((ppd->host_link_state == HLS_DN_DISABLE ||
+ ppd->host_link_state == HLS_DN_OFFLINE) &&
+ dd->dc_shutdown)
+ dc_start(dd);
+ /* Hand LED control to the DC */
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+ if (ppd->host_link_state != HLS_DN_OFFLINE) {
+ u8 tmp = ppd->link_enabled;
+
+ ret = goto_offline(ppd, ppd->remote_link_down_reason);
+ if (ret) {
+ ppd->link_enabled = tmp;
+ break;
+ }
+ ppd->remote_link_down_reason = 0;
+
+ if (ppd->driver_link_ready)
+ ppd->link_enabled = 1;
+ }
+
+ set_all_slowpath(ppd->dd);
+ ret = set_local_link_attributes(ppd);
+ if (ret)
+ break;
+
+ ppd->port_error_action = 0;
+ ppd->host_link_state = HLS_DN_POLL;
+
+ if (quick_linkup) {
+ /* quick linkup does not go into polling */
+ ret = do_quick_linkup(dd);
+ } else {
+ ret1 = set_physical_link_state(dd, PLS_POLLING);
+ if (ret1 != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to Polling link state, return 0x%x\n",
+ ret1);
+ ret = -EINVAL;
+ }
+ }
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+ /*
+ * If an error occurred above, go back to offline. The
+ * caller may reschedule another attempt.
+ */
+ if (ret)
+ goto_offline(ppd, 0);
+ break;
+ case HLS_DN_DISABLE:
+ /* link is disabled */
+ ppd->link_enabled = 0;
+
+ /* allow any state to transition to disabled */
+
+ /* must transition to offline first */
+ if (ppd->host_link_state != HLS_DN_OFFLINE) {
+ ret = goto_offline(ppd, ppd->remote_link_down_reason);
+ if (ret)
+ break;
+ ppd->remote_link_down_reason = 0;
+ }
+
+ ret1 = set_physical_link_state(dd, PLS_DISABLED);
+ if (ret1 != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to Disabled link state, return 0x%x\n",
+ ret1);
+ ret = -EINVAL;
+ break;
+ }
+ ppd->host_link_state = HLS_DN_DISABLE;
+ dc_shutdown(dd);
+ break;
+ case HLS_DN_OFFLINE:
+ if (ppd->host_link_state == HLS_DN_DISABLE)
+ dc_start(dd);
+
+ /* allow any state to transition to offline */
+ ret = goto_offline(ppd, ppd->remote_link_down_reason);
+ if (!ret)
+ ppd->remote_link_down_reason = 0;
+ break;
+ case HLS_VERIFY_CAP:
+ if (ppd->host_link_state != HLS_DN_POLL)
+ goto unexpected;
+ ppd->host_link_state = HLS_VERIFY_CAP;
+ break;
+ case HLS_GOING_UP:
+ if (ppd->host_link_state != HLS_VERIFY_CAP)
+ goto unexpected;
+
+ ret1 = set_physical_link_state(dd, PLS_LINKUP);
+ if (ret1 != HCMD_SUCCESS) {
+ dd_dev_err(dd,
+ "Failed to transition to link up state, return 0x%x\n",
+ ret1);
+ ret = -EINVAL;
+ break;
+ }
+ ppd->host_link_state = HLS_GOING_UP;
+ break;
+
+ case HLS_GOING_OFFLINE: /* transient within goto_offline() */
+ case HLS_LINK_COOLDOWN: /* transient within goto_offline() */
+ default:
+ dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+ __func__, state);
+ ret = -EINVAL;
+ break;
+ }
+
+ goto done;
+
+unexpected:
+ dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+ __func__, link_state_name(ppd->host_link_state),
+ link_state_name(state));
+ ret = -EINVAL;
+
+done:
+ mutex_unlock(&ppd->hls_lock);
+
+ if (event.device)
+ ib_dispatch_event(&event);
+
+ return ret;
+}
+
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+ u64 reg;
+ int ret = 0;
+
+ switch (which) {
+ case HFI1_IB_CFG_LIDLMC:
+ set_lidlmc(ppd);
+ break;
+ case HFI1_IB_CFG_VL_HIGH_LIMIT:
+ /*
+ * The VL Arbitrator high limit is sent in units of 4k
+ * bytes, while HFI stores it in units of 64 bytes.
+ */
+ val *= 4096 / 64;
+ reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
+ << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+ write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
+ break;
+ case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* HFI only supports POLL as the default link down state */
+ if (val != HLS_DN_POLL)
+ ret = -EINVAL;
+ break;
+ case HFI1_IB_CFG_OP_VLS:
+ if (ppd->vls_operational != val) {
+ ppd->vls_operational = val;
+ if (!ppd->port)
+ ret = -EINVAL;
+ }
+ break;
+ /*
+ * For link width, link width downgrade, and speed enable, always AND
+ * the setting with what is actually supported. This has two benefits.
+ * First, enabled can't have unsupported values, no matter what the
+ * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean
+ * "fill in with your supported value" have all the bits in the
+ * field set, so simply ANDing with supported has the desired result.
+ */
+ case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
+ ppd->link_width_enabled = val & ppd->link_width_supported;
+ break;
+ case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
+ ppd->link_width_downgrade_enabled =
+ val & ppd->link_width_downgrade_supported;
+ break;
+ case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+ ppd->link_speed_enabled = val & ppd->link_speed_supported;
+ break;
+ case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ /*
+ * HFI does not follow IB specs, save this value
+ * so we can report it, if asked.
+ */
+ ppd->overrun_threshold = val;
+ break;
+ case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ /*
+ * HFI does not follow IB specs, save this value
+ * so we can report it, if asked.
+ */
+ ppd->phy_error_threshold = val;
+ break;
+
+ case HFI1_IB_CFG_MTU:
+ set_send_length(ppd);
+ break;
+
+ case HFI1_IB_CFG_PKEYS:
+ if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+ set_partition_keys(ppd);
+ break;
+
+ default:
+ if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+ dd_dev_info(ppd->dd,
+ "%s: which %s, val 0x%x: not implemented\n",
+ __func__, ib_cfg_name(which), val);
+ break;
+ }
+ return ret;
+}
+
+/* begin functions related to vl arbitration table caching */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+ int i;
+
+ BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+ VL_ARB_LOW_PRIO_TABLE_SIZE);
+ BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+ VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+ /*
+ * Note that we always return values directly from the
+ * 'vl_arb_cache' (and do no CSR reads) in response to a
+ * 'Get(VLArbTable)'. This is obviously correct after a
+ * 'Set(VLArbTable)', since the cache will then be up to
+ * date. But it's also correct prior to any 'Set(VLArbTable)'
+ * since then both the cache, and the relevant h/w registers
+ * will be zeroed.
+ */
+
+ for (i = 0; i < MAX_PRIO_TABLE; i++)
+ spin_lock_init(&ppd->vl_arb_cache[i].lock);
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * All other vl_arb_* functions should be called only after locking
+ * the cache.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+ if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
+ return NULL;
+ spin_lock(&ppd->vl_arb_cache[idx].lock);
+ return &ppd->vl_arb_cache[idx];
+}
+
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+ spin_unlock(&ppd->vl_arb_cache[idx].lock);
+}
+
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+ struct ib_vl_weight_elem *vl)
+{
+ memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+ struct ib_vl_weight_elem *vl)
+{
+ memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+ struct ib_vl_weight_elem *vl)
+{
+ return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+/* end functions related to vl arbitration table caching */
+
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+ u32 size, struct ib_vl_weight_elem *vl)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+ unsigned int i, is_up = 0;
+ int drain, ret = 0;
+
+ mutex_lock(&ppd->hls_lock);
+
+ if (ppd->host_link_state & HLS_UP)
+ is_up = 1;
+
+ drain = !is_ax(dd) && is_up;
+
+ if (drain)
+ /*
+ * Before adjusting VL arbitration weights, empty per-VL
+ * FIFOs, otherwise a packet whose VL weight is being
+ * set to 0 could get stuck in a FIFO with no chance to
+ * egress.
+ */
+ ret = stop_drain_data_vls(dd);
+
+ if (ret) {
+ dd_dev_err(
+ dd,
+ "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+ __func__);
+ goto err;
+ }
+
+ for (i = 0; i < size; i++, vl++) {
+ /*
+ * NOTE: The low priority shift and mask are used here, but
+ * they are the same for both the low and high registers.
+ */
+ reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
+ << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
+ | (((u64)vl->weight
+ & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+ << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
+ write_csr(dd, target + (i * 8), reg);
+ }
+ pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+ if (drain)
+ open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+ mutex_unlock(&ppd->hls_lock);
+
+ return ret;
+}
+
+/*
+ * Read one credit merge VL register.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+ struct vl_limit *vll)
+{
+ u64 reg = read_csr(dd, csr);
+
+ vll->dedicated = cpu_to_be16(
+ (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+ & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
+ vll->shared = cpu_to_be16(
+ (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+ & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
+}
+
+/*
+ * Read the current credit merge limits.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+ struct buffer_control *bc, u16 *overall_limit)
+{
+ u64 reg;
+ int i;
+
+ /* not all entries are filled in */
+ memset(bc, 0, sizeof(*bc));
+
+ /* OPA and HFI have a 1-1 mapping */
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]);
+
+ /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+ read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+ reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+ bc->overall_shared_limit = cpu_to_be16(
+ (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+ & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+ if (overall_limit)
+ *overall_limit = (reg
+ >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+ & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+ return sizeof(struct buffer_control);
+}
+
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+ u64 reg;
+ int i;
+
+ /* each register contains 16 SC->VLnt mappings, 4 bits each */
+ reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+ for (i = 0; i < sizeof(u64); i++) {
+ u8 byte = *(((u8 *)&reg) + i);
+
+ dp->vlnt[2 * i] = byte & 0xf;
+ dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
+ }
+
+ reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+ for (i = 0; i < sizeof(u64); i++) {
+ u8 byte = *(((u8 *)&reg) + i);
+
+ dp->vlnt[16 + (2 * i)] = byte & 0xf;
+ dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
+ }
+ return sizeof(struct sc2vlnt);
+}
+
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+ struct ib_vl_weight_elem *vl)
+{
+ unsigned int i;
+
+ for (i = 0; i < nelems; i++, vl++) {
+ vl->vl = 0xf;
+ vl->weight = 0;
+ }
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+ DC_SC_VL_VAL(15_0,
+ 0, dp->vlnt[0] & 0xf,
+ 1, dp->vlnt[1] & 0xf,
+ 2, dp->vlnt[2] & 0xf,
+ 3, dp->vlnt[3] & 0xf,
+ 4, dp->vlnt[4] & 0xf,
+ 5, dp->vlnt[5] & 0xf,
+ 6, dp->vlnt[6] & 0xf,
+ 7, dp->vlnt[7] & 0xf,
+ 8, dp->vlnt[8] & 0xf,
+ 9, dp->vlnt[9] & 0xf,
+ 10, dp->vlnt[10] & 0xf,
+ 11, dp->vlnt[11] & 0xf,
+ 12, dp->vlnt[12] & 0xf,
+ 13, dp->vlnt[13] & 0xf,
+ 14, dp->vlnt[14] & 0xf,
+ 15, dp->vlnt[15] & 0xf));
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+ DC_SC_VL_VAL(31_16,
+ 16, dp->vlnt[16] & 0xf,
+ 17, dp->vlnt[17] & 0xf,
+ 18, dp->vlnt[18] & 0xf,
+ 19, dp->vlnt[19] & 0xf,
+ 20, dp->vlnt[20] & 0xf,
+ 21, dp->vlnt[21] & 0xf,
+ 22, dp->vlnt[22] & 0xf,
+ 23, dp->vlnt[23] & 0xf,
+ 24, dp->vlnt[24] & 0xf,
+ 25, dp->vlnt[25] & 0xf,
+ 26, dp->vlnt[26] & 0xf,
+ 27, dp->vlnt[27] & 0xf,
+ 28, dp->vlnt[28] & 0xf,
+ 29, dp->vlnt[29] & 0xf,
+ 30, dp->vlnt[30] & 0xf,
+ 31, dp->vlnt[31] & 0xf));
+}
+
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+ u16 limit)
+{
+ if (limit != 0)
+ dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+ what, (int)limit, idx);
+}
+
+/* change only the shared limit portion of SendCmGLobalCredit */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+ u64 reg;
+
+ reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+ reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
+ reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* change only the total credit limit portion of SendCmGLobalCredit */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+ u64 reg;
+
+ reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+ reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
+ reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* set the given per-VL shared limit */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+ u64 reg;
+ u32 addr;
+
+ if (vl < TXE_NUM_DATA_VL)
+ addr = SEND_CM_CREDIT_VL + (8 * vl);
+ else
+ addr = SEND_CM_CREDIT_VL15;
+
+ reg = read_csr(dd, addr);
+ reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
+ reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
+ write_csr(dd, addr, reg);
+}
+
+/* set the given per-VL dedicated limit */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+ u64 reg;
+ u32 addr;
+
+ if (vl < TXE_NUM_DATA_VL)
+ addr = SEND_CM_CREDIT_VL + (8 * vl);
+ else
+ addr = SEND_CM_CREDIT_VL15;
+
+ reg = read_csr(dd, addr);
+ reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
+ reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
+ write_csr(dd, addr, reg);
+}
+
+/* spin until the given per-VL status mask bits clear */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+ const char *which)
+{
+ unsigned long timeout;
+ u64 reg;
+
+ timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+
+ if (reg == 0)
+ return; /* success */
+ if (time_after(jiffies, timeout))
+ break; /* timed out */
+ udelay(1);
+ }
+
+ dd_dev_err(dd,
+ "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+ which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+ /*
+ * If this occurs, it is likely there was a credit loss on the link.
+ * The only recovery from that is a link bounce.
+ */
+ dd_dev_err(dd,
+ "Continuing anyway. A credit loss may occur. Suggest a link bounce\n");
+}
+
+/*
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented. In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ * set Global_Shared_Credit_Limit = 0
+ * use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ * mask1 = all VLs that are lowering dedicated limits
+ * lower Dedicated_Limit[mask1]
+ * spin until Return_Credit_Status[mask1] == 0
+ * raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ * earlier in the algorithm), set the new limit to the new value
+ */
+int set_buffer_control(struct hfi1_pportdata *ppd,
+ struct buffer_control *new_bc)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 changing_mask, ld_mask, stat_mask;
+ int change_count;
+ int i, use_all_mask;
+ int this_shared_changing;
+ int vl_count = 0, ret;
+ /*
+ * A0: add the variable any_shared_limit_changing below and in the
+ * algorithm above. If removing A0 support, it can be removed.
+ */
+ int any_shared_limit_changing;
+ struct buffer_control cur_bc;
+ u8 changing[OPA_MAX_VLS];
+ u8 lowering_dedicated[OPA_MAX_VLS];
+ u16 cur_total;
+ u32 new_total = 0;
+ const u64 all_mask =
+ SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+ | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16 /* look at VL15 and less */
+
+ /* find the new total credits, do sanity check on unused VLs */
+ for (i = 0; i < OPA_MAX_VLS; i++) {
+ if (valid_vl(i)) {
+ new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+ continue;
+ }
+ nonzero_msg(dd, i, "dedicated",
+ be16_to_cpu(new_bc->vl[i].dedicated));
+ nonzero_msg(dd, i, "shared",
+ be16_to_cpu(new_bc->vl[i].shared));
+ new_bc->vl[i].dedicated = 0;
+ new_bc->vl[i].shared = 0;
+ }
+ new_total += be16_to_cpu(new_bc->overall_shared_limit);
+
+ /* fetch the current values */
+ get_buffer_control(dd, &cur_bc, &cur_total);
+
+ /*
+ * Create the masks we will use.
+ */
+ memset(changing, 0, sizeof(changing));
+ memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+ /*
+ * NOTE: Assumes that the individual VL bits are adjacent and in
+ * increasing order
+ */
+ stat_mask =
+ SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+ changing_mask = 0;
+ ld_mask = 0;
+ change_count = 0;
+ any_shared_limit_changing = 0;
+ for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+ if (!valid_vl(i))
+ continue;
+ this_shared_changing = new_bc->vl[i].shared
+ != cur_bc.vl[i].shared;
+ if (this_shared_changing)
+ any_shared_limit_changing = 1;
+ if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated ||
+ this_shared_changing) {
+ changing[i] = 1;
+ changing_mask |= stat_mask;
+ change_count++;
+ }
+ if (be16_to_cpu(new_bc->vl[i].dedicated) <
+ be16_to_cpu(cur_bc.vl[i].dedicated)) {
+ lowering_dedicated[i] = 1;
+ ld_mask |= stat_mask;
+ }
+ }
+
+ /* bracket the credit change with a total adjustment */
+ if (new_total > cur_total)
+ set_global_limit(dd, new_total);
+
+ /*
+ * Start the credit change algorithm.
+ */
+ use_all_mask = 0;
+ if ((be16_to_cpu(new_bc->overall_shared_limit) <
+ be16_to_cpu(cur_bc.overall_shared_limit)) ||
+ (is_ax(dd) && any_shared_limit_changing)) {
+ set_global_shared(dd, 0);
+ cur_bc.overall_shared_limit = 0;
+ use_all_mask = 1;
+ }
+
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (changing[i]) {
+ set_vl_shared(dd, i, 0);
+ cur_bc.vl[i].shared = 0;
+ }
+ }
+
+ wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+ "shared");
+
+ if (change_count > 0) {
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (lowering_dedicated[i]) {
+ set_vl_dedicated(dd, i,
+ be16_to_cpu(new_bc->
+ vl[i].dedicated));
+ cur_bc.vl[i].dedicated =
+ new_bc->vl[i].dedicated;
+ }
+ }
+
+ wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+ /* now raise all dedicated that are going up */
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (be16_to_cpu(new_bc->vl[i].dedicated) >
+ be16_to_cpu(cur_bc.vl[i].dedicated))
+ set_vl_dedicated(dd, i,
+ be16_to_cpu(new_bc->
+ vl[i].dedicated));
+ }
+ }
+
+ /* next raise all shared that are going up */
+ for (i = 0; i < NUM_USABLE_VLS; i++) {
+ if (!valid_vl(i))
+ continue;
+
+ if (be16_to_cpu(new_bc->vl[i].shared) >
+ be16_to_cpu(cur_bc.vl[i].shared))
+ set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+ }
+
+ /* finally raise the global shared */
+ if (be16_to_cpu(new_bc->overall_shared_limit) >
+ be16_to_cpu(cur_bc.overall_shared_limit))
+ set_global_shared(dd,
+ be16_to_cpu(new_bc->overall_shared_limit));
+
+ /* bracket the credit change with a total adjustment */
+ if (new_total < cur_total)
+ set_global_limit(dd, new_total);
+
+ /*
+ * Determine the actual number of operational VLS using the number of
+ * dedicated and shared credits for each VL.
+ */
+ if (change_count > 0) {
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 ||
+ be16_to_cpu(new_bc->vl[i].shared) > 0)
+ vl_count++;
+ ppd->actual_vls_operational = vl_count;
+ ret = sdma_map_init(dd, ppd->port - 1, vl_count ?
+ ppd->actual_vls_operational :
+ ppd->vls_operational,
+ NULL);
+ if (ret == 0)
+ ret = pio_map_init(dd, ppd->port - 1, vl_count ?
+ ppd->actual_vls_operational :
+ ppd->vls_operational, NULL);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Read the given fabric manager table. Return the size of the
+ * table (in bytes) on success, and a negative error code on
+ * failure.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+
+{
+ int size;
+ struct vl_arb_cache *vlc;
+
+ switch (which) {
+ case FM_TBL_VL_HIGH_ARB:
+ size = 256;
+ /*
+ * OPA specifies 128 elements (of 2 bytes each), though
+ * HFI supports only 16 elements in h/w.
+ */
+ vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+ vl_arb_get_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+ break;
+ case FM_TBL_VL_LOW_ARB:
+ size = 256;
+ /*
+ * OPA specifies 128 elements (of 2 bytes each), though
+ * HFI supports only 16 elements in h/w.
+ */
+ vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+ vl_arb_get_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+ break;
+ case FM_TBL_BUFFER_CONTROL:
+ size = get_buffer_control(ppd->dd, t, NULL);
+ break;
+ case FM_TBL_SC2VLNT:
+ size = get_sc2vlnt(ppd->dd, t);
+ break;
+ case FM_TBL_VL_PREEMPT_ELEMS:
+ size = 256;
+ /* OPA specifies 128 elements, of 2 bytes each */
+ get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+ break;
+ case FM_TBL_VL_PREEMPT_MATRIX:
+ size = 256;
+ /*
+ * OPA specifies that this is the same size as the VL
+ * arbitration tables (i.e., 256 bytes).
+ */
+ break;
+ default:
+ return -EINVAL;
+ }
+ return size;
+}
+
+/*
+ * Write the given fabric manager table.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+ int ret = 0;
+ struct vl_arb_cache *vlc;
+
+ switch (which) {
+ case FM_TBL_VL_HIGH_ARB:
+ vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+ if (vl_arb_match_cache(vlc, t)) {
+ vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+ break;
+ }
+ vl_arb_set_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+ ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+ VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+ break;
+ case FM_TBL_VL_LOW_ARB:
+ vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+ if (vl_arb_match_cache(vlc, t)) {
+ vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+ break;
+ }
+ vl_arb_set_cache(vlc, t);
+ vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+ ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+ VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+ break;
+ case FM_TBL_BUFFER_CONTROL:
+ ret = set_buffer_control(ppd, t);
+ break;
+ case FM_TBL_SC2VLNT:
+ set_sc2vlnt(ppd->dd, t);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/*
+ * Disable all data VLs.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+ if (is_ax(dd))
+ return 1;
+
+ pio_send_control(dd, PSC_DATA_VL_DISABLE);
+
+ return 0;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Just re-enables all data VLs (the "fill" part happens
+ * automatically - the name was chosen for symmetry with
+ * stop_drain_data_vls()).
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+ if (is_ax(dd))
+ return 1;
+
+ pio_send_control(dd, PSC_DATA_VL_ENABLE);
+
+ return 0;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called,
+ * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
+ * engines to drop to 0.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+ sc_wait(dd);
+ sdma_wait(dd);
+ pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * Use open_fill_data_vls() to resume using data VLs. This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ ret = disable_data_vls(dd);
+ if (ret == 0)
+ drain_data_vls(dd);
+
+ return ret;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count. No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+ u32 cclocks;
+
+ if (dd->icode == ICODE_FPGA_EMULATION)
+ cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
+ else /* simulation pretends to be ASIC */
+ cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
+ if (ns && !cclocks) /* if ns nonzero, must be at least 1 */
+ cclocks = 1;
+ return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds. Not matter how slow
+ * the cclock, a non-zero cclocks will always have a non-zero result.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+ u32 ns;
+
+ if (dd->icode == ICODE_FPGA_EMULATION)
+ ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
+ else /* simulation pretends to be ASIC */
+ ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
+ if (cclocks && !ns)
+ ns = 1;
+ return ns;
+}
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 timeout = rcd->rcvavail_timeout;
+
+ /*
+ * This algorithm doubles or halves the timeout depending on whether
+ * the number of packets received in this interrupt were less than or
+ * greater equal the interrupt count.
+ *
+ * The calculations below do not allow a steady state to be achieved.
+ * Only at the endpoints it is possible to have an unchanging
+ * timeout.
+ */
+ if (npkts < rcv_intr_count) {
+ /*
+ * Not enough packets arrived before the timeout, adjust
+ * timeout downward.
+ */
+ if (timeout < 2) /* already at minimum? */
+ return;
+ timeout >>= 1;
+ } else {
+ /*
+ * More than enough packets arrived before the timeout, adjust
+ * timeout upward.
+ */
+ if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
+ return;
+ timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
+ }
+
+ rcd->rcvavail_timeout = timeout;
+ /*
+ * timeout cannot be larger than rcv_intr_timeout_csr which has already
+ * been verified to be in range
+ */
+ write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+ (u64)timeout <<
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+ u32 intr_adjust, u32 npkts)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u64 reg;
+ u32 ctxt = rcd->ctxt;
+
+ /*
+ * Need to write timeout register before updating RcvHdrHead to ensure
+ * that a new value is used when the HW decides to restart counting.
+ */
+ if (intr_adjust)
+ adjust_rcv_timeout(rcd, npkts);
+ if (updegr) {
+ reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+ << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+ write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+ }
+ mmiowb();
+ reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+ (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+ << RCV_HDR_HEAD_HEAD_SHIFT);
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+ mmiowb();
+}
+
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+ u32 head, tail;
+
+ head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+ & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+ if (rcd->rcvhdrtail_kvaddr)
+ tail = get_rcvhdrtail(rcd);
+ else
+ tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+ return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ * 0x0 invalid
+ * 0x1 4 KB
+ * 0x2 8 KB
+ * 0x3 16 KB
+ * 0x4 32 KB
+ * 0x5 64 KB
+ * 0x6 128 KB
+ * 0x7 256 KB
+ * 0x8 512 KB (Receive Array only)
+ * 0x9 1 MB (Receive Array only)
+ * 0xa 2 MB (Receive Array only)
+ *
+ * 0xB-0xF - reserved (Receive Array only)
+ *
+ *
+ * This routine assumes that the value has already been sanity checked.
+ */
+static u32 encoded_size(u32 size)
+{
+ switch (size) {
+ case 4 * 1024: return 0x1;
+ case 8 * 1024: return 0x2;
+ case 16 * 1024: return 0x3;
+ case 32 * 1024: return 0x4;
+ case 64 * 1024: return 0x5;
+ case 128 * 1024: return 0x6;
+ case 256 * 1024: return 0x7;
+ case 512 * 1024: return 0x8;
+ case 1 * 1024 * 1024: return 0x9;
+ case 2 * 1024 * 1024: return 0xa;
+ }
+ return 0x1; /* if invalid, go with the minimum size */
+}
+
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+ struct hfi1_ctxtdata *rcd;
+ u64 rcvctrl, reg;
+ int did_enable = 0;
+
+ rcd = dd->rcd[ctxt];
+ if (!rcd)
+ return;
+
+ hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+ rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+ /* if the context already enabled, don't do the extra steps */
+ if ((op & HFI1_RCVCTRL_CTXT_ENB) &&
+ !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+ /* reset the tail and hdr addresses, and sequence count */
+ write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+ rcd->rcvhdrq_phys);
+ if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+ write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+ rcd->rcvhdrqtailaddr_phys);
+ rcd->seq_cnt = 1;
+
+ /* reset the cached receive header queue head value */
+ rcd->head = 0;
+
+ /*
+ * Zero the receive header queue so we don't get false
+ * positives when checking the sequence number. The
+ * sequence numbers could land exactly on the same spot.
+ * E.g. a rcd restart before the receive header wrapped.
+ */
+ memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+ /* starting timeout */
+ rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+ /* enable the context */
+ rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+ /* clean the egr buffer size first */
+ rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+ rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+ & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+ << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+ /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+ did_enable = 1;
+
+ /* zero RcvEgrIndexHead */
+ write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+ /* set eager count and base index */
+ reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+ & RCV_EGR_CTRL_EGR_CNT_MASK)
+ << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+ (((rcd->eager_base >> RCV_SHIFT)
+ & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+ << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+ write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+ /*
+ * Set TID (expected) count and base index.
+ * rcd->expected_count is set to individual RcvArray entries,
+ * not pairs, and the CSR takes a pair-count in groups of
+ * four, so divide by 8.
+ */
+ reg = (((rcd->expected_count >> RCV_SHIFT)
+ & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+ << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+ (((rcd->expected_base >> RCV_SHIFT)
+ & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+ << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+ write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+ if (ctxt == HFI1_CTRL_CTXT)
+ write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT);
+ }
+ if (op & HFI1_RCVCTRL_CTXT_DIS) {
+ write_csr(dd, RCV_VL15, 0);
+ /*
+ * When receive context is being disabled turn on tail
+ * update with a dummy tail address and then disable
+ * receive context.
+ */
+ if (dd->rcvhdrtail_dummy_physaddr) {
+ write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+ dd->rcvhdrtail_dummy_physaddr);
+ /* Enabling RcvCtxtCtrl.TailUpd is intentional. */
+ rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+ }
+
+ rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+ }
+ if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+ if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+ if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+ rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+ if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
+ /* See comment on RcvCtxtCtrl.TailUpd above */
+ if (!(op & HFI1_RCVCTRL_CTXT_DIS))
+ rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+ }
+ if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+ if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+ if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+ /*
+ * In one-packet-per-eager mode, the size comes from
+ * the RcvArray entry.
+ */
+ rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+ rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+ }
+ if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+ if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+ rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+ rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+ rcd->rcvctrl = rcvctrl;
+ hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+ write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+ /* work around sticky RcvCtxtStatus.BlockedRHQFull */
+ if (did_enable &&
+ (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+ reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+ if (reg != 0) {
+ dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+ ctxt, reg);
+ read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+ read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+ reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+ dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+ ctxt, reg, reg == 0 ? "not" : "still");
+ }
+ }
+
+ if (did_enable) {
+ /*
+ * The interrupt timeout and count must be set after
+ * the context is enabled to take effect.
+ */
+ /* set interrupt timeout */
+ write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+ (u64)rcd->rcvavail_timeout <<
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+ /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+ reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+ write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+ }
+
+ if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+ /*
+ * If the context has been disabled and the Tail Update has
+ * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address
+ * so it doesn't contain an address that is invalid.
+ */
+ write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+ dd->rcvhdrtail_dummy_physaddr);
+}
+
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp)
+{
+ int ret;
+ u64 val = 0;
+
+ if (namep) {
+ ret = dd->cntrnameslen;
+ *namep = dd->cntrnames;
+ } else {
+ const struct cntr_entry *entry;
+ int i, j;
+
+ ret = (dd->ndevcntrs) * sizeof(u64);
+
+ /* Get the start of the block of counters */
+ *cntrp = dd->cntrs;
+
+ /*
+ * Now go and fill in each counter in the block.
+ */
+ for (i = 0; i < DEV_CNTR_LAST; i++) {
+ entry = &dev_cntrs[i];
+ hfi1_cdbg(CNTR, "reading %s", entry->name);
+ if (entry->flags & CNTR_DISABLED) {
+ /* Nothing */
+ hfi1_cdbg(CNTR, "\tDisabled\n");
+ } else {
+ if (entry->flags & CNTR_VL) {
+ hfi1_cdbg(CNTR, "\tPer VL\n");
+ for (j = 0; j < C_VL_COUNT; j++) {
+ val = entry->rw_cntr(entry,
+ dd, j,
+ CNTR_MODE_R,
+ 0);
+ hfi1_cdbg(
+ CNTR,
+ "\t\tRead 0x%llx for %d\n",
+ val, j);
+ dd->cntrs[entry->offset + j] =
+ val;
+ }
+ } else if (entry->flags & CNTR_SDMA) {
+ hfi1_cdbg(CNTR,
+ "\t Per SDMA Engine\n");
+ for (j = 0; j < dd->chip_sdma_engines;
+ j++) {
+ val =
+ entry->rw_cntr(entry, dd, j,
+ CNTR_MODE_R, 0);
+ hfi1_cdbg(CNTR,
+ "\t\tRead 0x%llx for %d\n",
+ val, j);
+ dd->cntrs[entry->offset + j] =
+ val;
+ }
+ } else {
+ val = entry->rw_cntr(entry, dd,
+ CNTR_INVALID_VL,
+ CNTR_MODE_R, 0);
+ dd->cntrs[entry->offset] = val;
+ hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ */
+u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp)
+{
+ int ret;
+ u64 val = 0;
+
+ if (namep) {
+ ret = ppd->dd->portcntrnameslen;
+ *namep = ppd->dd->portcntrnames;
+ } else {
+ const struct cntr_entry *entry;
+ int i, j;
+
+ ret = ppd->dd->nportcntrs * sizeof(u64);
+ *cntrp = ppd->cntrs;
+
+ for (i = 0; i < PORT_CNTR_LAST; i++) {
+ entry = &port_cntrs[i];
+ hfi1_cdbg(CNTR, "reading %s", entry->name);
+ if (entry->flags & CNTR_DISABLED) {
+ /* Nothing */
+ hfi1_cdbg(CNTR, "\tDisabled\n");
+ continue;
+ }
+
+ if (entry->flags & CNTR_VL) {
+ hfi1_cdbg(CNTR, "\tPer VL");
+ for (j = 0; j < C_VL_COUNT; j++) {
+ val = entry->rw_cntr(entry, ppd, j,
+ CNTR_MODE_R,
+ 0);
+ hfi1_cdbg(
+ CNTR,
+ "\t\tRead 0x%llx for %d",
+ val, j);
+ ppd->cntrs[entry->offset + j] = val;
+ }
+ } else {
+ val = entry->rw_cntr(entry, ppd,
+ CNTR_INVALID_VL,
+ CNTR_MODE_R,
+ 0);
+ ppd->cntrs[entry->offset] = val;
+ hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+ }
+ }
+ }
+ return ret;
+}
+
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ if (dd->synth_stats_timer.data)
+ del_timer_sync(&dd->synth_stats_timer);
+ dd->synth_stats_timer.data = 0;
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ kfree(ppd->cntrs);
+ kfree(ppd->scntrs);
+ free_percpu(ppd->ibport_data.rvp.rc_acks);
+ free_percpu(ppd->ibport_data.rvp.rc_qacks);
+ free_percpu(ppd->ibport_data.rvp.rc_delayed_comp);
+ ppd->cntrs = NULL;
+ ppd->scntrs = NULL;
+ ppd->ibport_data.rvp.rc_acks = NULL;
+ ppd->ibport_data.rvp.rc_qacks = NULL;
+ ppd->ibport_data.rvp.rc_delayed_comp = NULL;
+ }
+ kfree(dd->portcntrnames);
+ dd->portcntrnames = NULL;
+ kfree(dd->cntrs);
+ dd->cntrs = NULL;
+ kfree(dd->scntrs);
+ dd->scntrs = NULL;
+ kfree(dd->cntrnames);
+ dd->cntrnames = NULL;
+}
+
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+ u64 *psval, void *context, int vl)
+{
+ u64 val;
+ u64 sval = *psval;
+
+ if (entry->flags & CNTR_DISABLED) {
+ dd_dev_err(dd, "Counter %s not enabled", entry->name);
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+ /* If its a synthetic counter there is more work we need to do */
+ if (entry->flags & CNTR_SYNTH) {
+ if (sval == CNTR_MAX) {
+ /* No need to read already saturated */
+ return CNTR_MAX;
+ }
+
+ if (entry->flags & CNTR_32BIT) {
+ /* 32bit counters can wrap multiple times */
+ u64 upper = sval >> 32;
+ u64 lower = (sval << 32) >> 32;
+
+ if (lower > val) { /* hw wrapped */
+ if (upper == CNTR_32BIT_MAX)
+ val = CNTR_MAX;
+ else
+ upper++;
+ }
+
+ if (val != CNTR_MAX)
+ val = (upper << 32) | val;
+
+ } else {
+ /* If we rolled we are saturated */
+ if ((val < sval) || (val > CNTR_MAX))
+ val = CNTR_MAX;
+ }
+ }
+
+ *psval = val;
+
+ hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+ return val;
+}
+
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+ struct cntr_entry *entry,
+ u64 *psval, void *context, int vl, u64 data)
+{
+ u64 val;
+
+ if (entry->flags & CNTR_DISABLED) {
+ dd_dev_err(dd, "Counter %s not enabled", entry->name);
+ return 0;
+ }
+
+ hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+ if (entry->flags & CNTR_SYNTH) {
+ *psval = data;
+ if (entry->flags & CNTR_32BIT) {
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+ (data << 32) >> 32);
+ val = data; /* return the full 64bit value */
+ } else {
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+ data);
+ }
+ } else {
+ val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+ }
+
+ *psval = val;
+
+ hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+ return val;
+}
+
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &dev_cntrs[index];
+ sval = dd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ return read_dev_port_cntr(dd, entry, sval, dd, vl);
+}
+
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &dev_cntrs[index];
+ sval = dd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
+}
+
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &port_cntrs[index];
+ sval = ppd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+ (index <= C_RCV_HDR_OVF_LAST)) {
+ /* We do not want to bother for disabled contexts */
+ return 0;
+ }
+
+ return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
+}
+
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+ struct cntr_entry *entry;
+ u64 *sval;
+
+ entry = &port_cntrs[index];
+ sval = ppd->scntrs + entry->offset;
+
+ if (vl != CNTR_INVALID_VL)
+ sval += vl;
+
+ if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+ (index <= C_RCV_HDR_OVF_LAST)) {
+ /* We do not want to bother for disabled contexts */
+ return 0;
+ }
+
+ return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
+}
+
+static void update_synth_timer(unsigned long opaque)
+{
+ u64 cur_tx;
+ u64 cur_rx;
+ u64 total_flits;
+ u8 update = 0;
+ int i, j, vl;
+ struct hfi1_pportdata *ppd;
+ struct cntr_entry *entry;
+
+ struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+
+ /*
+ * Rather than keep beating on the CSRs pick a minimal set that we can
+ * check to watch for potential roll over. We can do this by looking at
+ * the number of flits sent/recv. If the total flits exceeds 32bits then
+ * we have to iterate all the counters and update.
+ */
+ entry = &dev_cntrs[C_DC_RCV_FLITS];
+ cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+ entry = &dev_cntrs[C_DC_XMIT_FLITS];
+ cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+ hfi1_cdbg(
+ CNTR,
+ "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+ dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
+
+ if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
+ /*
+ * May not be strictly necessary to update but it won't hurt and
+ * simplifies the logic here.
+ */
+ update = 1;
+ hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+ dd->unit);
+ } else {
+ total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
+ hfi1_cdbg(CNTR,
+ "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+ total_flits, (u64)CNTR_32BIT_MAX);
+ if (total_flits >= CNTR_32BIT_MAX) {
+ hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+ dd->unit);
+ update = 1;
+ }
+ }
+
+ if (update) {
+ hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+ for (i = 0; i < DEV_CNTR_LAST; i++) {
+ entry = &dev_cntrs[i];
+ if (entry->flags & CNTR_VL) {
+ for (vl = 0; vl < C_VL_COUNT; vl++)
+ read_dev_cntr(dd, i, vl);
+ } else {
+ read_dev_cntr(dd, i, CNTR_INVALID_VL);
+ }
+ }
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ for (j = 0; j < PORT_CNTR_LAST; j++) {
+ entry = &port_cntrs[j];
+ if (entry->flags & CNTR_VL) {
+ for (vl = 0; vl < C_VL_COUNT; vl++)
+ read_port_cntr(ppd, j, vl);
+ } else {
+ read_port_cntr(ppd, j, CNTR_INVALID_VL);
+ }
+ }
+ }
+
+ /*
+ * We want the value in the register. The goal is to keep track
+ * of the number of "ticks" not the counter value. In other
+ * words if the register rolls we want to notice it and go ahead
+ * and force an update.
+ */
+ entry = &dev_cntrs[C_DC_XMIT_FLITS];
+ dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+ CNTR_MODE_R, 0);
+
+ entry = &dev_cntrs[C_DC_RCV_FLITS];
+ dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+ CNTR_MODE_R, 0);
+
+ hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+ dd->unit, dd->last_tx, dd->last_rx);
+
+ } else {
+ hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+ }
+
+ mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for /0 */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+ int i, rcv_ctxts, j;
+ size_t sz;
+ char *p;
+ char name[C_MAX_NAME];
+ struct hfi1_pportdata *ppd;
+ const char *bit_type_32 = ",32";
+ const int bit_type_32_sz = strlen(bit_type_32);
+
+ /* set up the stats timer; the add_timer is done at the end */
+ setup_timer(&dd->synth_stats_timer, update_synth_timer,
+ (unsigned long)dd);
+
+ /***********************/
+ /* per device counters */
+ /***********************/
+
+ /* size names and determine how many we have*/
+ dd->ndevcntrs = 0;
+ sz = 0;
+
+ for (i = 0; i < DEV_CNTR_LAST; i++) {
+ if (dev_cntrs[i].flags & CNTR_DISABLED) {
+ hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+ continue;
+ }
+
+ if (dev_cntrs[i].flags & CNTR_VL) {
+ dev_cntrs[i].offset = dd->ndevcntrs;
+ for (j = 0; j < C_VL_COUNT; j++) {
+ snprintf(name, C_MAX_NAME, "%s%d",
+ dev_cntrs[i].name, vl_from_idx(j));
+ sz += strlen(name);
+ /* Add ",32" for 32-bit counters */
+ if (dev_cntrs[i].flags & CNTR_32BIT)
+ sz += bit_type_32_sz;
+ sz++;
+ dd->ndevcntrs++;
+ }
+ } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+ dev_cntrs[i].offset = dd->ndevcntrs;
+ for (j = 0; j < dd->chip_sdma_engines; j++) {
+ snprintf(name, C_MAX_NAME, "%s%d",
+ dev_cntrs[i].name, j);
+ sz += strlen(name);
+ /* Add ",32" for 32-bit counters */
+ if (dev_cntrs[i].flags & CNTR_32BIT)
+ sz += bit_type_32_sz;
+ sz++;
+ dd->ndevcntrs++;
+ }
+ } else {
+ /* +1 for newline. */
+ sz += strlen(dev_cntrs[i].name) + 1;
+ /* Add ",32" for 32-bit counters */
+ if (dev_cntrs[i].flags & CNTR_32BIT)
+ sz += bit_type_32_sz;
+ dev_cntrs[i].offset = dd->ndevcntrs;
+ dd->ndevcntrs++;
+ }
+ }
+
+ /* allocate space for the counter values */
+ dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
+ if (!dd->cntrs)
+ goto bail;
+
+ dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL);
+ if (!dd->scntrs)
+ goto bail;
+
+ /* allocate space for the counter names */
+ dd->cntrnameslen = sz;
+ dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+ if (!dd->cntrnames)
+ goto bail;
+
+ /* fill in the names */
+ for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) {
+ if (dev_cntrs[i].flags & CNTR_DISABLED) {
+ /* Nothing */
+ } else if (dev_cntrs[i].flags & CNTR_VL) {
+ for (j = 0; j < C_VL_COUNT; j++) {
+ snprintf(name, C_MAX_NAME, "%s%d",
+ dev_cntrs[i].name,
+ vl_from_idx(j));
+ memcpy(p, name, strlen(name));
+ p += strlen(name);
+
+ /* Counter is 32 bits */
+ if (dev_cntrs[i].flags & CNTR_32BIT) {
+ memcpy(p, bit_type_32, bit_type_32_sz);
+ p += bit_type_32_sz;
+ }
+
+ *p++ = '\n';
+ }
+ } else if (dev_cntrs[i].flags & CNTR_SDMA) {
+ for (j = 0; j < dd->chip_sdma_engines; j++) {
+ snprintf(name, C_MAX_NAME, "%s%d",
+ dev_cntrs[i].name, j);
+ memcpy(p, name, strlen(name));
+ p += strlen(name);
+
+ /* Counter is 32 bits */
+ if (dev_cntrs[i].flags & CNTR_32BIT) {
+ memcpy(p, bit_type_32, bit_type_32_sz);
+ p += bit_type_32_sz;
+ }
+
+ *p++ = '\n';
+ }
+ } else {
+ memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name));
+ p += strlen(dev_cntrs[i].name);
+
+ /* Counter is 32 bits */
+ if (dev_cntrs[i].flags & CNTR_32BIT) {
+ memcpy(p, bit_type_32, bit_type_32_sz);
+ p += bit_type_32_sz;
+ }
+
+ *p++ = '\n';
+ }
+ }
+
+ /*********************/
+ /* per port counters */
+ /*********************/
+
+ /*
+ * Go through the counters for the overflows and disable the ones we
+ * don't need. This varies based on platform so we need to do it
+ * dynamically here.
+ */
+ rcv_ctxts = dd->num_rcv_contexts;
+ for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+ i <= C_RCV_HDR_OVF_LAST; i++) {
+ port_cntrs[i].flags |= CNTR_DISABLED;
+ }
+
+ /* size port counter names and determine how many we have*/
+ sz = 0;
+ dd->nportcntrs = 0;
+ for (i = 0; i < PORT_CNTR_LAST; i++) {
+ if (port_cntrs[i].flags & CNTR_DISABLED) {
+ hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+ continue;
+ }
+
+ if (port_cntrs[i].flags & CNTR_VL) {
+ port_cntrs[i].offset = dd->nportcntrs;
+ for (j = 0; j < C_VL_COUNT; j++) {
+ snprintf(name, C_MAX_NAME, "%s%d",
+ port_cntrs[i].name, vl_from_idx(j));
+ sz += strlen(name);
+ /* Add ",32" for 32-bit counters */
+ if (port_cntrs[i].flags & CNTR_32BIT)
+ sz += bit_type_32_sz;
+ sz++;
+ dd->nportcntrs++;
+ }
+ } else {
+ /* +1 for newline */
+ sz += strlen(port_cntrs[i].name) + 1;
+ /* Add ",32" for 32-bit counters */
+ if (port_cntrs[i].flags & CNTR_32BIT)
+ sz += bit_type_32_sz;
+ port_cntrs[i].offset = dd->nportcntrs;
+ dd->nportcntrs++;
+ }
+ }
+
+ /* allocate space for the counter names */
+ dd->portcntrnameslen = sz;
+ dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+ if (!dd->portcntrnames)
+ goto bail;
+
+ /* fill in port cntr names */
+ for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+ if (port_cntrs[i].flags & CNTR_DISABLED)
+ continue;
+
+ if (port_cntrs[i].flags & CNTR_VL) {
+ for (j = 0; j < C_VL_COUNT; j++) {
+ snprintf(name, C_MAX_NAME, "%s%d",
+ port_cntrs[i].name, vl_from_idx(j));
+ memcpy(p, name, strlen(name));
+ p += strlen(name);
+
+ /* Counter is 32 bits */
+ if (port_cntrs[i].flags & CNTR_32BIT) {
+ memcpy(p, bit_type_32, bit_type_32_sz);
+ p += bit_type_32_sz;
+ }
+
+ *p++ = '\n';
+ }
+ } else {
+ memcpy(p, port_cntrs[i].name,
+ strlen(port_cntrs[i].name));
+ p += strlen(port_cntrs[i].name);
+
+ /* Counter is 32 bits */
+ if (port_cntrs[i].flags & CNTR_32BIT) {
+ memcpy(p, bit_type_32, bit_type_32_sz);
+ p += bit_type_32_sz;
+ }
+
+ *p++ = '\n';
+ }
+ }
+
+ /* allocate per port storage for counter values */
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+ if (!ppd->cntrs)
+ goto bail;
+
+ ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+ if (!ppd->scntrs)
+ goto bail;
+ }
+
+ /* CPU counters need to be allocated and zeroed */
+ if (init_cpu_counters(dd))
+ goto bail;
+
+ mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+ return 0;
+bail:
+ free_cntrs(dd);
+ return -ENOMEM;
+}
+
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+ switch (chip_lstate) {
+ default:
+ dd_dev_err(dd,
+ "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+ chip_lstate);
+ /* fall through */
+ case LSTATE_DOWN:
+ return IB_PORT_DOWN;
+ case LSTATE_INIT:
+ return IB_PORT_INIT;
+ case LSTATE_ARMED:
+ return IB_PORT_ARMED;
+ case LSTATE_ACTIVE:
+ return IB_PORT_ACTIVE;
+ }
+}
+
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+ /* look at the HFI meta-states only */
+ switch (chip_pstate & 0xf0) {
+ default:
+ dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+ chip_pstate);
+ /* fall through */
+ case PLS_DISABLED:
+ return IB_PORTPHYSSTATE_DISABLED;
+ case PLS_OFFLINE:
+ return OPA_PORTPHYSSTATE_OFFLINE;
+ case PLS_POLLING:
+ return IB_PORTPHYSSTATE_POLLING;
+ case PLS_CONFIGPHY:
+ return IB_PORTPHYSSTATE_TRAINING;
+ case PLS_LINKUP:
+ return IB_PORTPHYSSTATE_LINKUP;
+ case PLS_PHYTEST:
+ return IB_PORTPHYSSTATE_PHY_TEST;
+ }
+}
+
+/* return the OPA port logical state name */
+const char *opa_lstate_name(u32 lstate)
+{
+ static const char * const port_logical_names[] = {
+ "PORT_NOP",
+ "PORT_DOWN",
+ "PORT_INIT",
+ "PORT_ARMED",
+ "PORT_ACTIVE",
+ "PORT_ACTIVE_DEFER",
+ };
+ if (lstate < ARRAY_SIZE(port_logical_names))
+ return port_logical_names[lstate];
+ return "unknown";
+}
+
+/* return the OPA port physical state name */
+const char *opa_pstate_name(u32 pstate)
+{
+ static const char * const port_physical_names[] = {
+ "PHYS_NOP",
+ "reserved1",
+ "PHYS_POLL",
+ "PHYS_DISABLED",
+ "PHYS_TRAINING",
+ "PHYS_LINKUP",
+ "PHYS_LINK_ERR_RECOVER",
+ "PHYS_PHY_TEST",
+ "reserved8",
+ "PHYS_OFFLINE",
+ "PHYS_GANGED",
+ "PHYS_TEST",
+ };
+ if (pstate < ARRAY_SIZE(port_physical_names))
+ return port_physical_names[pstate];
+ return "unknown";
+}
+
+/*
+ * Read the hardware link state and set the driver's cached value of it.
+ * Return the (new) current value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+ u32 new_state;
+
+ new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+ if (new_state != ppd->lstate) {
+ dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+ opa_lstate_name(new_state), new_state);
+ ppd->lstate = new_state;
+ }
+ /*
+ * Set port status flags in the page mapped into userspace
+ * memory. Do it here to ensure a reliable state - this is
+ * the only function called by all state handling code.
+ * Always set the flags due to the fact that the cache value
+ * might have been changed explicitly outside of this
+ * function.
+ */
+ if (ppd->statusp) {
+ switch (ppd->lstate) {
+ case IB_PORT_DOWN:
+ case IB_PORT_INIT:
+ *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+ HFI1_STATUS_IB_READY);
+ break;
+ case IB_PORT_ARMED:
+ *ppd->statusp |= HFI1_STATUS_IB_CONF;
+ break;
+ case IB_PORT_ACTIVE:
+ *ppd->statusp |= HFI1_STATUS_IB_READY;
+ break;
+ }
+ }
+ return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - wait for an IB link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for IB link state change to occur.
+ * For now, take the easy polling route.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+ int msecs)
+{
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(msecs);
+ while (1) {
+ if (get_logical_state(ppd) == state)
+ return 0;
+ if (time_after(jiffies, timeout))
+ break;
+ msleep(20);
+ }
+ dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
+
+ return -ETIMEDOUT;
+}
+
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
+ u32 pstate;
+ u32 ib_pstate;
+
+ pstate = read_physical_state(ppd->dd);
+ ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
+ if (ppd->last_pstate != ib_pstate) {
+ dd_dev_info(ppd->dd,
+ "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+ __func__, opa_pstate_name(ib_pstate), ib_pstate,
+ pstate);
+ ppd->last_pstate = ib_pstate;
+ }
+ return ib_pstate;
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+int hfi1_init_ctxt(struct send_context *sc)
+{
+ if (sc) {
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg;
+ u8 set = (sc->type == SC_USER ?
+ HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
+ HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
+ reg = read_kctxt_csr(dd, sc->hw_context,
+ SEND_CTXT_CHECK_ENABLE);
+ if (set)
+ CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+ else
+ SET_STATIC_RATE_CONTROL_SMASK(reg);
+ write_kctxt_csr(dd, sc->hw_context,
+ SEND_CTXT_CHECK_ENABLE, reg);
+ }
+ return 0;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+ int ret = 0;
+ u64 reg;
+
+ if (dd->icode != ICODE_RTL_SILICON) {
+ if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+ dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+ __func__);
+ return -EINVAL;
+ }
+ reg = read_csr(dd, ASIC_STS_THERM);
+ temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+ ASIC_STS_THERM_CURR_TEMP_MASK);
+ temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+ ASIC_STS_THERM_LO_TEMP_MASK);
+ temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+ ASIC_STS_THERM_HI_TEMP_MASK);
+ temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+ ASIC_STS_THERM_CRIT_TEMP_MASK);
+ /* triggers is a 3-bit value - 1 bit per trigger. */
+ temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+ return ret;
+}
+
+/* ========================================================================= */
+
+/*
+ * Enable/disable chip from delivering interrupts.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+ int i;
+
+ /*
+ * In HFI, the mask needs to be 1 to allow interrupts.
+ */
+ if (enable) {
+ /* enable all interrupts */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_MASK + (8 * i), ~(u64)0);
+
+ init_qsfp_int(dd);
+ } else {
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+ }
+}
+
+/*
+ * Clear all interrupt sources on the chip.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0);
+
+ write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
+ write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
+ for (i = 0; i < dd->chip_sdma_engines; i++)
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
+
+ write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
+ write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
+ write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
+}
+
+/* Move to pcie.c? */
+static void disable_intx(struct pci_dev *pdev)
+{
+ pci_intx(pdev, 0);
+}
+
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* remove irqs - must happen before disabling/turning off */
+ if (dd->num_msix_entries) {
+ /* MSI-X */
+ struct hfi1_msix_entry *me = dd->msix_entries;
+
+ for (i = 0; i < dd->num_msix_entries; i++, me++) {
+ if (!me->arg) /* => no irq, no affinity */
+ continue;
+ hfi1_put_irq_affinity(dd, &dd->msix_entries[i]);
+ free_irq(me->msix.vector, me->arg);
+ }
+ } else {
+ /* INTx */
+ if (dd->requested_intx_irq) {
+ free_irq(dd->pcidev->irq, dd);
+ dd->requested_intx_irq = 0;
+ }
+ }
+
+ /* turn off interrupts */
+ if (dd->num_msix_entries) {
+ /* MSI-X */
+ pci_disable_msix(dd->pcidev);
+ } else {
+ /* INTx */
+ disable_intx(dd->pcidev);
+ }
+
+ /* clean structures */
+ kfree(dd->msix_entries);
+ dd->msix_entries = NULL;
+ dd->num_msix_entries = 0;
+}
+
+/*
+ * Remap the interrupt source from the general handler to the given MSI-X
+ * interrupt.
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+ u64 reg;
+ int m, n;
+
+ /* clear from the handled mask of the general interrupt */
+ m = isrc / 64;
+ n = isrc % 64;
+ dd->gi_mask[m] &= ~((u64)1 << n);
+
+ /* direct the chip source to the given MSI-X interrupt */
+ m = isrc / 8;
+ n = isrc % 8;
+ reg = read_csr(dd, CCE_INT_MAP + (8 * m));
+ reg &= ~((u64)0xff << (8 * n));
+ reg |= ((u64)msix_intr & 0xff) << (8 * n);
+ write_csr(dd, CCE_INT_MAP + (8 * m), reg);
+}
+
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+ int engine, int msix_intr)
+{
+ /*
+ * SDMA engine interrupt sources grouped by type, rather than
+ * engine. Per-engine interrupts are as follows:
+ * SDMA
+ * SDMAProgress
+ * SDMAIdle
+ */
+ remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
+ msix_intr);
+ remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
+ msix_intr);
+ remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
+ msix_intr);
+}
+
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d",
+ dd->unit);
+ ret = request_irq(dd->pcidev->irq, general_interrupt,
+ IRQF_SHARED, dd->intx_name, dd);
+ if (ret)
+ dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+ ret);
+ else
+ dd->requested_intx_irq = 1;
+ return ret;
+}
+
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+ int first_general, last_general;
+ int first_sdma, last_sdma;
+ int first_rx, last_rx;
+ int i, ret = 0;
+
+ /* calculate the ranges we are going to use */
+ first_general = 0;
+ last_general = first_general + 1;
+ first_sdma = last_general;
+ last_sdma = first_sdma + dd->num_sdma;
+ first_rx = last_sdma;
+ last_rx = first_rx + dd->n_krcv_queues;
+
+ /*
+ * Sanity check - the code expects all SDMA chip source
+ * interrupts to be in the same CSR, starting at bit 0. Verify
+ * that this is true by checking the bit location of the start.
+ */
+ BUILD_BUG_ON(IS_SDMA_START % 64);
+
+ for (i = 0; i < dd->num_msix_entries; i++) {
+ struct hfi1_msix_entry *me = &dd->msix_entries[i];
+ const char *err_info;
+ irq_handler_t handler;
+ irq_handler_t thread = NULL;
+ void *arg;
+ int idx;
+ struct hfi1_ctxtdata *rcd = NULL;
+ struct sdma_engine *sde = NULL;
+
+ /* obtain the arguments to request_irq */
+ if (first_general <= i && i < last_general) {
+ idx = i - first_general;
+ handler = general_interrupt;
+ arg = dd;
+ snprintf(me->name, sizeof(me->name),
+ DRIVER_NAME "_%d", dd->unit);
+ err_info = "general";
+ me->type = IRQ_GENERAL;
+ } else if (first_sdma <= i && i < last_sdma) {
+ idx = i - first_sdma;
+ sde = &dd->per_sdma[idx];
+ handler = sdma_interrupt;
+ arg = sde;
+ snprintf(me->name, sizeof(me->name),
+ DRIVER_NAME "_%d sdma%d", dd->unit, idx);
+ err_info = "sdma";
+ remap_sdma_interrupts(dd, idx, i);
+ me->type = IRQ_SDMA;
+ } else if (first_rx <= i && i < last_rx) {
+ idx = i - first_rx;
+ rcd = dd->rcd[idx];
+ /* no interrupt if no rcd */
+ if (!rcd)
+ continue;
+ /*
+ * Set the interrupt register and mask for this
+ * context's interrupt.
+ */
+ rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
+ rcd->imask = ((u64)1) <<
+ ((IS_RCVAVAIL_START + idx) % 64);
+ handler = receive_context_interrupt;
+ thread = receive_context_thread;
+ arg = rcd;
+ snprintf(me->name, sizeof(me->name),
+ DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
+ err_info = "receive context";
+ remap_intr(dd, IS_RCVAVAIL_START + idx, i);
+ me->type = IRQ_RCVCTXT;
+ } else {
+ /* not in our expected range - complain, then
+ * ignore it
+ */
+ dd_dev_err(dd,
+ "Unexpected extra MSI-X interrupt %d\n", i);
+ continue;
+ }
+ /* no argument, no interrupt */
+ if (!arg)
+ continue;
+ /* make sure the name is terminated */
+ me->name[sizeof(me->name) - 1] = 0;
+
+ ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
+ me->name, arg);
+ if (ret) {
+ dd_dev_err(dd,
+ "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+ err_info, me->msix.vector, idx, ret);
+ return ret;
+ }
+ /*
+ * assign arg after request_irq call, so it will be
+ * cleaned up
+ */
+ me->arg = arg;
+
+ ret = hfi1_get_irq_affinity(dd, me);
+ if (ret)
+ dd_dev_err(dd,
+ "unable to pin IRQ %d\n", ret);
+ }
+
+ return ret;
+}
+
+/*
+ * Set the general handler to accept all interrupts, remap all
+ * chip interrupts back to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* all interrupts handled by the general handler */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ dd->gi_mask[i] = ~(u64)0;
+
+ /* all chip interrupts map to MSI-X 0 */
+ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+ write_csr(dd, CCE_INT_MAP + (8 * i), 0);
+}
+
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+ struct hfi1_msix_entry *entries;
+ u32 total, request;
+ int i, ret;
+ int single_interrupt = 0; /* we expect to have all the interrupts */
+
+ /*
+ * Interrupt count:
+ * 1 general, "slow path" interrupt (includes the SDMA engines
+ * slow source, SDMACleanupDone)
+ * N interrupts - one per used SDMA engine
+ * M interrupt - one per kernel receive context
+ */
+ total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+ entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+ if (!entries) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ /* 1-1 MSI-X entry assignment */
+ for (i = 0; i < total; i++)
+ entries[i].msix.entry = i;
+
+ /* ask for MSI-X interrupts */
+ request = total;
+ request_msix(dd, &request, entries);
+
+ if (request == 0) {
+ /* using INTx */
+ /* dd->num_msix_entries already zero */
+ kfree(entries);
+ single_interrupt = 1;
+ dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+ } else {
+ /* using MSI-X */
+ dd->num_msix_entries = request;
+ dd->msix_entries = entries;
+
+ if (request != total) {
+ /* using MSI-X, with reduced interrupts */
+ dd_dev_err(
+ dd,
+ "cannot handle reduced interrupt case, want %u, got %u\n",
+ total, request);
+ ret = -EINVAL;
+ goto fail;
+ }
+ dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+ }
+
+ /* mask all interrupts */
+ set_intr_state(dd, 0);
+ /* clear all pending interrupts */
+ clear_all_interrupts(dd);
+
+ /* reset general handler mask, chip MSI-X mappings */
+ reset_interrupts(dd);
+
+ if (single_interrupt)
+ ret = request_intx_irq(dd);
+ else
+ ret = request_msix_irqs(dd);
+ if (ret)
+ goto fail;
+
+ return 0;
+
+fail:
+ clean_up_interrupts(dd);
+ return ret;
+}
+
+/*
+ * Set up context values in dd. Sets:
+ *
+ * num_rcv_contexts - number of contexts being used
+ * n_krcv_queues - number of kernel contexts
+ * first_user_ctxt - first non-kernel context in array of contexts
+ * freectxts - number of free user contexts
+ * num_send_contexts - number of PIO send contexts being used
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+ unsigned long num_kernel_contexts;
+ int total_contexts;
+ int ret;
+ unsigned ngroups;
+ int qos_rmt_count;
+ int user_rmt_reduced;
+
+ /*
+ * Kernel receive contexts:
+ * - Context 0 - control context (VL15/multicast/error)
+ * - Context 1 - first kernel context
+ * - Context 2 - second kernel context
+ * ...
+ */
+ if (n_krcvqs)
+ /*
+ * n_krcvqs is the sum of module parameter kernel receive
+ * contexts, krcvqs[]. It does not include the control
+ * context, so add that.
+ */
+ num_kernel_contexts = n_krcvqs + 1;
+ else
+ num_kernel_contexts = DEFAULT_KRCVQS + 1;
+ /*
+ * Every kernel receive context needs an ACK send context.
+ * one send context is allocated for each VL{0-7} and VL15
+ */
+ if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+ dd_dev_err(dd,
+ "Reducing # kernel rcv contexts to: %d, from %lu\n",
+ (int)(dd->chip_send_contexts - num_vls - 1),
+ num_kernel_contexts);
+ num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+ }
+ /*
+ * User contexts:
+ * - default to 1 user context per real (non-HT) CPU core if
+ * num_user_contexts is negative
+ */
+ if (num_user_contexts < 0)
+ num_user_contexts =
+ cpumask_weight(&node_affinity.real_cpu_mask);
+
+ total_contexts = num_kernel_contexts + num_user_contexts;
+
+ /*
+ * Adjust the counts given a global max.
+ */
+ if (total_contexts > dd->chip_rcv_contexts) {
+ dd_dev_err(dd,
+ "Reducing # user receive contexts to: %d, from %d\n",
+ (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+ (int)num_user_contexts);
+ num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+ /* recalculate */
+ total_contexts = num_kernel_contexts + num_user_contexts;
+ }
+
+ /* each user context requires an entry in the RMT */
+ qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+ if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+ user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+ dd_dev_err(dd,
+ "RMT size is reducing the number of user receive contexts from %d to %d\n",
+ (int)num_user_contexts,
+ user_rmt_reduced);
+ /* recalculate */
+ num_user_contexts = user_rmt_reduced;
+ total_contexts = num_kernel_contexts + num_user_contexts;
+ }
+
+ /* the first N are kernel contexts, the rest are user contexts */
+ dd->num_rcv_contexts = total_contexts;
+ dd->n_krcv_queues = num_kernel_contexts;
+ dd->first_user_ctxt = num_kernel_contexts;
+ dd->num_user_contexts = num_user_contexts;
+ dd->freectxts = num_user_contexts;
+ dd_dev_info(dd,
+ "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+ (int)dd->chip_rcv_contexts,
+ (int)dd->num_rcv_contexts,
+ (int)dd->n_krcv_queues,
+ (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+ /*
+ * Receive array allocation:
+ * All RcvArray entries are divided into groups of 8. This
+ * is required by the hardware and will speed up writes to
+ * consecutive entries by using write-combining of the entire
+ * cacheline.
+ *
+ * The number of groups are evenly divided among all contexts.
+ * any left over groups will be given to the first N user
+ * contexts.
+ */
+ dd->rcv_entries.group_size = RCV_INCREMENT;
+ ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+ dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+ dd->rcv_entries.nctxt_extra = ngroups -
+ (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+ dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+ dd->rcv_entries.ngroups,
+ dd->rcv_entries.nctxt_extra);
+ if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+ MAX_EAGER_ENTRIES * 2) {
+ dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+ dd->rcv_entries.group_size;
+ dd_dev_info(dd,
+ "RcvArray group count too high, change to %u\n",
+ dd->rcv_entries.ngroups);
+ dd->rcv_entries.nctxt_extra = 0;
+ }
+ /*
+ * PIO send contexts
+ */
+ ret = init_sc_pools_and_sizes(dd);
+ if (ret >= 0) { /* success */
+ dd->num_send_contexts = ret;
+ dd_dev_info(
+ dd,
+ "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
+ dd->chip_send_contexts,
+ dd->num_send_contexts,
+ dd->sc_sizes[SC_KERNEL].count,
+ dd->sc_sizes[SC_ACK].count,
+ dd->sc_sizes[SC_USER].count,
+ dd->sc_sizes[SC_VL15].count);
+ ret = 0; /* success */
+ }
+
+ return ret;
+}
+
+/*
+ * Set the device/port partition key table. The MAD code
+ * will ensure that, at least, the partial management
+ * partition key is present in the table.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg = 0;
+ int i;
+
+ dd_dev_info(dd, "Setting partition keys\n");
+ for (i = 0; i < hfi1_get_npkeys(dd); i++) {
+ reg |= (ppd->pkeys[i] &
+ RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+ ((i % 4) *
+ RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+ /* Each register holds 4 PKey values. */
+ if ((i % 4) == 3) {
+ write_csr(dd, RCV_PARTITION_KEY +
+ ((i - 3) * 2), reg);
+ reg = 0;
+ }
+ }
+
+ /* Always enable HW pkeys check when pkeys table is set */
+ add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * These CSRs and memories are uninitialized on reset and must be
+ * written before reading to set the ECC/parity bits.
+ *
+ * NOTE: All user context CSRs that are not mmaped write-only
+ * (e.g. the TID flows) must be initialized even if the driver never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+ int i, j;
+
+ /* CceIntMap */
+ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+ write_csr(dd, CCE_INT_MAP + (8 * i), 0);
+
+ /* SendCtxtCreditReturnAddr */
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+ /* PIO Send buffers */
+ /* SDMA Send buffers */
+ /*
+ * These are not normally read, and (presently) have no method
+ * to be read, so are not pre-initialized
+ */
+
+ /* RcvHdrAddr */
+ /* RcvHdrTailAddr */
+ /* RcvTidFlowTable */
+ for (i = 0; i < dd->chip_rcv_contexts; i++) {
+ write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+ for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
+ write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0);
+ }
+
+ /* RcvArray */
+ for (i = 0; i < dd->chip_rcv_array_count; i++)
+ write_csr(dd, RCV_ARRAY + (8 * i),
+ RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+ /* RcvQPMapTable */
+ for (i = 0; i < 32; i++)
+ write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+}
+
+/*
+ * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+ u64 ctrl_bits)
+{
+ unsigned long timeout;
+ u64 reg;
+
+ /* is the condition present? */
+ reg = read_csr(dd, CCE_STATUS);
+ if ((reg & status_bits) == 0)
+ return;
+
+ /* clear the condition */
+ write_csr(dd, CCE_CTRL, ctrl_bits);
+
+ /* wait for the condition to clear */
+ timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+ while (1) {
+ reg = read_csr(dd, CCE_STATUS);
+ if ((reg & status_bits) == 0)
+ return;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(dd,
+ "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+ status_bits, reg & status_bits);
+ return;
+ }
+ udelay(1);
+ }
+}
+
+/* set CCE CSRs to chip reset defaults */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* CCE_REVISION read-only */
+ /* CCE_REVISION2 read-only */
+ /* CCE_CTRL - bits clear automatically */
+ /* CCE_STATUS read-only, use CceCtrl to clear */
+ clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+ clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+ clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+ for (i = 0; i < CCE_NUM_SCRATCH; i++)
+ write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+ /* CCE_ERR_STATUS read-only */
+ write_csr(dd, CCE_ERR_MASK, 0);
+ write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+ /* CCE_ERR_FORCE leave alone */
+ for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+ write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+ /* CCE_PCIE_CTRL leave alone */
+ for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+ write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+ write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+ CCE_MSIX_TABLE_UPPER_RESETCSR);
+ }
+ for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+ /* CCE_MSIX_PBA read-only */
+ write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+ write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+ }
+ for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+ write_csr(dd, CCE_INT_MAP, 0);
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+ /* CCE_INT_STATUS read-only */
+ write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+ write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+ /* CCE_INT_FORCE leave alone */
+ /* CCE_INT_BLOCKED read-only */
+ }
+ for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+ write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/* set MISC CSRs to chip reset defaults */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+ write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+ write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+ }
+ /*
+ * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+ * only be written 128-byte chunks
+ */
+ /* init RSA engine to clear lingering errors */
+ write_csr(dd, MISC_CFG_RSA_CMD, 1);
+ write_csr(dd, MISC_CFG_RSA_MU, 0);
+ write_csr(dd, MISC_CFG_FW_CTRL, 0);
+ /* MISC_STS_8051_DIGEST read-only */
+ /* MISC_STS_SBM_DIGEST read-only */
+ /* MISC_STS_PCIE_DIGEST read-only */
+ /* MISC_STS_FAB_DIGEST read-only */
+ /* MISC_ERR_STATUS read-only */
+ write_csr(dd, MISC_ERR_MASK, 0);
+ write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+ /* MISC_ERR_FORCE leave alone */
+}
+
+/* set TXE CSRs to chip reset defaults */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /*
+ * TXE Kernel CSRs
+ */
+ write_csr(dd, SEND_CTRL, 0);
+ __cm_reset(dd, 0); /* reset CM internal state */
+ /* SEND_CONTEXTS read-only */
+ /* SEND_DMA_ENGINES read-only */
+ /* SEND_PIO_MEM_SIZE read-only */
+ /* SEND_DMA_MEM_SIZE read-only */
+ write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+ pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */
+ /* SEND_PIO_ERR_STATUS read-only */
+ write_csr(dd, SEND_PIO_ERR_MASK, 0);
+ write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+ /* SEND_PIO_ERR_FORCE leave alone */
+ /* SEND_DMA_ERR_STATUS read-only */
+ write_csr(dd, SEND_DMA_ERR_MASK, 0);
+ write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+ /* SEND_DMA_ERR_FORCE leave alone */
+ /* SEND_EGRESS_ERR_STATUS read-only */
+ write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+ write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+ /* SEND_EGRESS_ERR_FORCE leave alone */
+ write_csr(dd, SEND_BTH_QP, 0);
+ write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+ write_csr(dd, SEND_SC2VLT0, 0);
+ write_csr(dd, SEND_SC2VLT1, 0);
+ write_csr(dd, SEND_SC2VLT2, 0);
+ write_csr(dd, SEND_SC2VLT3, 0);
+ write_csr(dd, SEND_LEN_CHECK0, 0);
+ write_csr(dd, SEND_LEN_CHECK1, 0);
+ /* SEND_ERR_STATUS read-only */
+ write_csr(dd, SEND_ERR_MASK, 0);
+ write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+ /* SEND_ERR_FORCE read-only */
+ for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+ write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0);
+ for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+ write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0);
+ for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++)
+ write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0);
+ for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+ write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0);
+ for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+ write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0);
+ write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+ write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR);
+ /* SEND_CM_CREDIT_USED_STATUS read-only */
+ write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+ write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+ write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+ write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+ write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+ for (i = 0; i < TXE_NUM_DATA_VL; i++)
+ write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0);
+ write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+ /* SEND_CM_CREDIT_USED_VL read-only */
+ /* SEND_CM_CREDIT_USED_VL15 read-only */
+ /* SEND_EGRESS_CTXT_STATUS read-only */
+ /* SEND_EGRESS_SEND_DMA_STATUS read-only */
+ write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+ /* SEND_EGRESS_ERR_INFO read-only */
+ /* SEND_EGRESS_ERR_SOURCE read-only */
+
+ /*
+ * TXE Per-Context CSRs
+ */
+ for (i = 0; i < dd->chip_send_contexts; i++) {
+ write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+ write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+ }
+
+ /*
+ * TXE Per-SDMA CSRs
+ */
+ for (i = 0; i < dd->chip_sdma_engines; i++) {
+ write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+ /* SEND_DMA_STATUS read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+ /* SEND_DMA_HEAD read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+ /* SEND_DMA_IDLE_CNT read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+ /* SEND_DMA_DESC_FETCHED_CNT read-only */
+ /* SEND_DMA_ENG_ERR_STATUS read-only */
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+ /* SEND_DMA_ENG_ERR_FORCE leave alone */
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+ write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+ }
+}
+
+/*
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ int count;
+
+ /*
+ * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+ * clear.
+ */
+ count = 0;
+ while (1) {
+ reg = read_csr(dd, RCV_STATUS);
+ if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+ | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+ break;
+ /*
+ * Give up after 1ms - maximum wait time.
+ *
+ * RBuf size is 148KiB. Slowest possible is PCIe Gen1 x1 at
+ * 250MB/s bandwidth. Lower rate to 66% for overhead to get:
+ * 148 KB / (66% * 250MB/s) = 920us
+ */
+ if (count++ > 500) {
+ dd_dev_err(dd,
+ "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+ __func__, reg);
+ break;
+ }
+ udelay(2); /* do not busy-wait the CSR */
+ }
+
+ /* start the init - expect RcvCtrl to be 0 */
+ write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+ /*
+ * Read to force the write of Rcvtrl.RxRbufInit. There is a brief
+ * period after the write before RcvStatus.RxRbufInitDone is valid.
+ * The delay in the first run through the loop below is sufficient and
+ * required before the first read of RcvStatus.RxRbufInintDone.
+ */
+ read_csr(dd, RCV_CTRL);
+
+ /* wait for the init to finish */
+ count = 0;
+ while (1) {
+ /* delay is required first time through - see above */
+ udelay(2); /* do not busy-wait the CSR */
+ reg = read_csr(dd, RCV_STATUS);
+ if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+ break;
+
+ /* give up after 100us - slowest possible at 33MHz is 73us */
+ if (count++ > 50) {
+ dd_dev_err(dd,
+ "%s: RcvStatus.RxRbufInit not set, continuing\n",
+ __func__);
+ break;
+ }
+ }
+}
+
+/* set RXE CSRs to chip reset defaults */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+ int i, j;
+
+ /*
+ * RXE Kernel CSRs
+ */
+ write_csr(dd, RCV_CTRL, 0);
+ init_rbufs(dd);
+ /* RCV_STATUS read-only */
+ /* RCV_CONTEXTS read-only */
+ /* RCV_ARRAY_CNT read-only */
+ /* RCV_BUF_SIZE read-only */
+ write_csr(dd, RCV_BTH_QP, 0);
+ write_csr(dd, RCV_MULTICAST, 0);
+ write_csr(dd, RCV_BYPASS, 0);
+ write_csr(dd, RCV_VL15, 0);
+ /* this is a clear-down */
+ write_csr(dd, RCV_ERR_INFO,
+ RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+ /* RCV_ERR_STATUS read-only */
+ write_csr(dd, RCV_ERR_MASK, 0);
+ write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+ /* RCV_ERR_FORCE leave alone */
+ for (i = 0; i < 32; i++)
+ write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+ for (i = 0; i < 4; i++)
+ write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+ for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+ write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+ for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+ write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+ for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+ write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+ write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+ write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+ }
+ for (i = 0; i < 32; i++)
+ write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+ /*
+ * RXE Kernel and User Per-Context CSRs
+ */
+ for (i = 0; i < dd->chip_rcv_contexts; i++) {
+ /* kernel */
+ write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+ /* RCV_CTXT_STATUS read-only */
+ write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+ write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+ write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+ write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+ write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+ /* user */
+ /* RCV_HDR_TAIL read-only */
+ write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+ /* RCV_EGR_INDEX_TAIL read-only */
+ write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+ /* RCV_EGR_OFFSET_TAIL read-only */
+ for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+ write_uctxt_csr(dd, i,
+ RCV_TID_FLOW_TABLE + (8 * j), 0);
+ }
+ }
+}
+
+/*
+ * Set sc2vl tables.
+ *
+ * They power on to zeros, so to avoid send context errors
+ * they need to be set:
+ *
+ * SC 0-7 -> VL 0-7 (respectively)
+ * SC 15 -> VL 15
+ * otherwise
+ * -> VL 0
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+ int i;
+ /* init per architecture spec, constrained by hardware capability */
+
+ /* HFI maps sent packets */
+ write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+ 0,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 4, 4, 5, 5,
+ 6, 6, 7, 7));
+ write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+ 1,
+ 8, 0, 9, 0,
+ 10, 0, 11, 0,
+ 12, 0, 13, 0,
+ 14, 0, 15, 15));
+ write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+ 2,
+ 16, 0, 17, 0,
+ 18, 0, 19, 0,
+ 20, 0, 21, 0,
+ 22, 0, 23, 0));
+ write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+ 3,
+ 24, 0, 25, 0,
+ 26, 0, 27, 0,
+ 28, 0, 29, 0,
+ 30, 0, 31, 0));
+
+ /* DC maps received packets */
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+ 15_0,
+ 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+ write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+ 31_16,
+ 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+ 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+ /* initialize the cached sc2vl values consistently with h/w */
+ for (i = 0; i < 32; i++) {
+ if (i < 8 || i == 15)
+ *((u8 *)(dd->sc2vl) + i) = (u8)i;
+ else
+ *((u8 *)(dd->sc2vl) + i) = 0;
+ }
+}
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values. We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ *
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /*
+ * Put the HFI CSRs in a known state.
+ * Combine this with a DC reset.
+ *
+ * Stop the device from doing anything while we do a
+ * reset. We know there are no other active users of
+ * the device since we are now in charge. Turn off
+ * off all outbound and inbound traffic and make sure
+ * the device does not generate any interrupts.
+ */
+
+ /* disable send contexts and SDMA engines */
+ write_csr(dd, SEND_CTRL, 0);
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+ for (i = 0; i < dd->chip_sdma_engines; i++)
+ write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+ /* disable port (turn off RXE inbound traffic) and contexts */
+ write_csr(dd, RCV_CTRL, 0);
+ for (i = 0; i < dd->chip_rcv_contexts; i++)
+ write_csr(dd, RCV_CTXT_CTRL, 0);
+ /* mask all interrupt sources */
+ for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+ write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+
+ /*
+ * DC Reset: do a full DC reset before the register clear.
+ * A recommended length of time to hold is one CSR read,
+ * so reread the CceDcCtrl. Then, hold the DC in reset
+ * across the clear.
+ */
+ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+ (void)read_csr(dd, CCE_DC_CTRL);
+
+ if (use_flr) {
+ /*
+ * A FLR will reset the SPC core and part of the PCIe.
+ * The parts that need to be restored have already been
+ * saved.
+ */
+ dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+ /* do the FLR, the DC reset will remain */
+ hfi1_pcie_flr(dd);
+
+ /* restore command and BARs */
+ restore_pci_variables(dd);
+
+ if (is_ax(dd)) {
+ dd_dev_info(dd, "Resetting CSRs with FLR\n");
+ hfi1_pcie_flr(dd);
+ restore_pci_variables(dd);
+ }
+ } else {
+ dd_dev_info(dd, "Resetting CSRs with writes\n");
+ reset_cce_csrs(dd);
+ reset_txe_csrs(dd);
+ reset_rxe_csrs(dd);
+ reset_misc_csrs(dd);
+ }
+ /* clear the DC reset */
+ write_csr(dd, CCE_DC_CTRL, 0);
+
+ /* Set the LED off */
+ setextled(dd, 0);
+
+ /*
+ * Clear the QSFP reset.
+ * An FLR enforces a 0 on all out pins. The driver does not touch
+ * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and
+ * anything plugged constantly in reset, if it pays attention
+ * to RESET_N.
+ * Prime examples of this are optical cables. Set all pins high.
+ * I2CCLK and I2CDAT will change per direction, and INT_N and
+ * MODPRS_N are input only and their value is ignored.
+ */
+ write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+ write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+ init_chip_resources(dd);
+}
+
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* assign link credit variables */
+ dd->vau = CM_VAU;
+ dd->link_credits = CM_GLOBAL_CREDITS;
+ if (is_ax(dd))
+ dd->link_credits--;
+ dd->vcu = cu_to_vcu(hfi1_cu);
+ /* enough room for 8 MAD packets plus header - 17K */
+ dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+ if (dd->vl15_init > dd->link_credits)
+ dd->vl15_init = dd->link_credits;
+
+ write_uninitialized_csrs_and_memories(dd);
+
+ if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+ for (i = 0; i < dd->num_pports; i++) {
+ struct hfi1_pportdata *ppd = &dd->pport[i];
+
+ set_partition_keys(ppd);
+ }
+ init_sc2vl_tables(dd);
+}
+
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+ /* user changed the KDETH_QP */
+ if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
+ /* out of range or illegal value */
+ dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
+ kdeth_qp = 0;
+ }
+ if (kdeth_qp == 0) /* not set, or failed range check */
+ kdeth_qp = DEFAULT_KDETH_QP;
+
+ write_csr(dd, SEND_BTH_QP,
+ (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
+ SEND_BTH_QP_KDETH_QP_SHIFT);
+
+ write_csr(dd, RCV_BTH_QP,
+ (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
+ RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - first context
+ *
+ * This return sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The routine will round robin the 256 settings
+ * from first_ctxt to last_ctxt.
+ *
+ * The first/last looks ahead to having specialized
+ * receive contexts for mgmt and bypass. Normal
+ * verbs traffic will assumed to be on a range
+ * of receive contexts.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+ u32 first_ctxt,
+ u32 last_ctxt)
+{
+ u64 reg = 0;
+ u64 regno = RCV_QP_MAP_TABLE;
+ int i;
+ u64 ctxt = first_ctxt;
+
+ for (i = 0; i < 256; i++) {
+ reg |= ctxt << (8 * (i % 8));
+ ctxt++;
+ if (ctxt > last_ctxt)
+ ctxt = first_ctxt;
+ if (i % 8 == 7) {
+ write_csr(dd, regno, reg);
+ reg = 0;
+ regno += 8;
+ }
+ }
+
+ add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+ | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
+
+struct rsm_map_table {
+ u64 map[NUM_MAP_REGS];
+ unsigned int used;
+};
+
+struct rsm_rule_data {
+ u8 offset;
+ u8 pkt_type;
+ u32 field1_off;
+ u32 field2_off;
+ u32 index1_off;
+ u32 index1_width;
+ u32 index2_off;
+ u32 index2_width;
+ u32 mask1;
+ u32 value1;
+ u32 mask2;
+ u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in. OK if it
+ * returns NULL, indicating no table.
+ */
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
+{
+ struct rsm_map_table *rmt;
+ u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */
+
+ rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+ if (rmt) {
+ memset(rmt->map, rxcontext, sizeof(rmt->map));
+ rmt->used = 0;
+ }
+
+ return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and free the table. OK if
+ * table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
+{
+ int i;
+
+ if (rmt) {
+ /* write table to chip */
+ for (i = 0; i < NUM_MAP_REGS; i++)
+ write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+ /* enable RSM */
+ add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+ }
+}
+
+/*
+ * Add a receive side mapping rule.
+ */
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+ struct rsm_rule_data *rrd)
+{
+ write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+ (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+ 1ull << rule_index | /* enable bit */
+ (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+ write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+ (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+ (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+ (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+ (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+ (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+ (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+ write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+ (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+ (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+ (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+ (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+ unsigned int *np)
+{
+ int i;
+ unsigned int m, n;
+ u8 max_by_vl = 0;
+
+ /* is QOS active at all? */
+ if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+ num_vls == 1 ||
+ krcvqsset <= 1)
+ goto no_qos;
+
+ /* determine bits for qpn */
+ for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
+ if (krcvqs[i] > max_by_vl)
+ max_by_vl = krcvqs[i];
+ if (max_by_vl > 32)
+ goto no_qos;
+ m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+ /* determine bits for vl */
+ n = ilog2(__roundup_pow_of_two(num_vls));
+
+ /* reject if too much is used */
+ if ((m + n) > 7)
+ goto no_qos;
+
+ if (mp)
+ *mp = m;
+ if (np)
+ *np = n;
+
+ return 1 << (m + n);
+
+no_qos:
+ if (mp)
+ *mp = 0;
+ if (np)
+ *np = 0;
+ return 0;
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+ struct rsm_rule_data rrd;
+ unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+ unsigned int rmt_entries;
+ u64 reg;
+
+ if (!rmt)
+ goto bail;
+ rmt_entries = qos_rmt_entries(dd, &m, &n);
+ if (rmt_entries == 0)
+ goto bail;
+ qpns_per_vl = 1 << m;
+
+ /* enough room in the map table? */
+ rmt_entries = 1 << (m + n);
+ if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
+ goto bail;
+
+ /* add qos entries to the the RSM map table */
+ for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
+ unsigned tctxt;
+
+ for (qpn = 0, tctxt = ctxt;
+ krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+ unsigned idx, regoff, regidx;
+
+ /* generate the index the hardware will produce */
+ idx = rmt->used + ((qpn << n) ^ i);
+ regoff = (idx % 8) * 8;
+ regidx = idx / 8;
+ /* replace default with context number */
+ reg = rmt->map[regidx];
+ reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+ << regoff);
+ reg |= (u64)(tctxt++) << regoff;
+ rmt->map[regidx] = reg;
+ if (tctxt == ctxt + krcvqs[i])
+ tctxt = ctxt;
+ }
+ ctxt += krcvqs[i];
+ }
+
+ rrd.offset = rmt->used;
+ rrd.pkt_type = 2;
+ rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+ rrd.field2_off = LRH_SC_MATCH_OFFSET;
+ rrd.index1_off = LRH_SC_SELECT_OFFSET;
+ rrd.index1_width = n;
+ rrd.index2_off = QPN_SELECT_OFFSET;
+ rrd.index2_width = m + n;
+ rrd.mask1 = LRH_BTH_MASK;
+ rrd.value1 = LRH_BTH_VALUE;
+ rrd.mask2 = LRH_SC_MASK;
+ rrd.value2 = LRH_SC_VALUE;
+
+ /* add rule 0 */
+ add_rsm_rule(dd, 0, &rrd);
+
+ /* mark RSM map entries as used */
+ rmt->used += rmt_entries;
+ /* map everything else to the mcast/err/vl15 context */
+ init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
+ dd->qos_shift = n + 1;
+ return;
+bail:
+ dd->qos_shift = 1;
+ init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
+}
+
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
+{
+ struct rsm_rule_data rrd;
+ u64 reg;
+ int i, idx, regoff, regidx;
+ u8 offset;
+
+ /* there needs to be enough room in the map table */
+ if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+ dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+ return;
+ }
+
+ /*
+ * RSM will extract the destination context as an index into the
+ * map table. The destination contexts are a sequential block
+ * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+ * Map entries are accessed as offset + extracted value. Adjust
+ * the added offset so this sequence can be placed anywhere in
+ * the table - as long as the entries themselves do not wrap.
+ * There are only enough bits in offset for the table size, so
+ * start with that to allow for a "negative" offset.
+ */
+ offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+ (int)dd->first_user_ctxt);
+
+ for (i = dd->first_user_ctxt, idx = rmt->used;
+ i < dd->num_rcv_contexts; i++, idx++) {
+ /* replace with identity mapping */
+ regoff = (idx % 8) * 8;
+ regidx = idx / 8;
+ reg = rmt->map[regidx];
+ reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+ reg |= (u64)i << regoff;
+ rmt->map[regidx] = reg;
+ }
+
+ /*
+ * For RSM intercept of Expected FECN packets:
+ * o packet type 0 - expected
+ * o match on F (bit 95), using select/match 1, and
+ * o match on SH (bit 133), using select/match 2.
+ *
+ * Use index 1 to extract the 8-bit receive context from DestQP
+ * (start at bit 64). Use that as the RSM map table index.
+ */
+ rrd.offset = offset;
+ rrd.pkt_type = 0;
+ rrd.field1_off = 95;
+ rrd.field2_off = 133;
+ rrd.index1_off = 64;
+ rrd.index1_width = 8;
+ rrd.index2_off = 0;
+ rrd.index2_width = 0;
+ rrd.mask1 = 1;
+ rrd.value1 = 1;
+ rrd.mask2 = 1;
+ rrd.value2 = 1;
+
+ /* add rule 1 */
+ add_rsm_rule(dd, 1, &rrd);
+
+ rmt->used += dd->num_user_contexts;
+}
+
+static void init_rxe(struct hfi1_devdata *dd)
+{
+ struct rsm_map_table *rmt;
+
+ /* enable all receive errors */
+ write_csr(dd, RCV_ERR_MASK, ~0ull);
+
+ rmt = alloc_rsm_map_table(dd);
+ /* set up QOS, including the QPN map table */
+ init_qos(dd, rmt);
+ init_user_fecn_handling(dd, rmt);
+ complete_rsm_map_table(dd, rmt);
+ kfree(rmt);
+
+ /*
+ * make sure RcvCtrl.RcvWcb <= PCIe Device Control
+ * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
+ * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one
+ * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
+ * Max_PayLoad_Size set to its minimum of 128.
+ *
+ * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
+ * (64 bytes). Max_Payload_Size is possibly modified upward in
+ * tune_pcie_caps() which is called after this routine.
+ */
+}
+
+static void init_other(struct hfi1_devdata *dd)
+{
+ /* enable all CCE errors */
+ write_csr(dd, CCE_ERR_MASK, ~0ull);
+ /* enable *some* Misc errors */
+ write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+ /* enable all DC errors, except LCB */
+ write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
+ write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
+}
+
+/*
+ * Fill out the given AU table using the given CU. A CU is defined in terms
+ * AUs. The table is a an encoding: given the index, how many AUs does that
+ * represent?
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+ u32 csr0to3, u32 csr4to7)
+{
+ write_csr(dd, csr0to3,
+ 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT |
+ 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT |
+ 2ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT |
+ 4ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+ write_csr(dd, csr4to7,
+ 8ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT |
+ 16ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT |
+ 32ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT |
+ 64ull * cu <<
+ SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+}
+
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+ assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
+ SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+ assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
+ SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+static void init_txe(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* enable all PIO, SDMA, general, and Egress errors */
+ write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
+ write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
+ write_csr(dd, SEND_ERR_MASK, ~0ull);
+ write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
+
+ /* enable all per-context and per-SDMA engine errors */
+ for (i = 0; i < dd->chip_send_contexts; i++)
+ write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
+ for (i = 0; i < dd->chip_sdma_engines; i++)
+ write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
+
+ /* set the local CU to AU mapping */
+ assign_local_cm_au_table(dd, dd->vcu);
+
+ /*
+ * Set reasonable default for Credit Return Timer
+ * Don't set on Simulator - causes it to choke.
+ */
+ if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+ write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+ struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+ ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+ SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+ /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+ if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+ reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+ /*
+ * Enable send-side J_KEY integrity check, unless this is A0 h/w
+ */
+ if (!is_ax(dd)) {
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+ }
+
+ /* Enable J_KEY check on receive context. */
+ reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+ ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+ RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+ write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+ return ret;
+}
+
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+ struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+ /*
+ * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+ * This check would not have been enabled for A0 h/w, see
+ * set_ctxt_jkey().
+ */
+ if (!is_ax(dd)) {
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+ }
+ /* Turn off the J_KEY on the receive side */
+ write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+ return ret;
+}
+
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (ctxt < dd->num_rcv_contexts) {
+ rcd = dd->rcd[ctxt];
+ } else {
+ ret = -EINVAL;
+ goto done;
+ }
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+ SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+ reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+done:
+ return ret;
+}
+
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned sctxt;
+ int ret = 0;
+ u64 reg;
+
+ if (ctxt < dd->num_rcv_contexts) {
+ rcd = dd->rcd[ctxt];
+ } else {
+ ret = -EINVAL;
+ goto done;
+ }
+ if (!rcd || !rcd->sc) {
+ ret = -EINVAL;
+ goto done;
+ }
+ sctxt = rcd->sc->hw_context;
+ reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+ reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+ write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+done:
+ return ret;
+}
+
+/*
+ * Start doing the clean up the the chip. Our clean up happens in multiple
+ * stages and this is just the first.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+ aspm_exit(dd);
+ free_cntrs(dd);
+ free_rcverr(dd);
+ clean_up_interrupts(dd);
+ finish_chip_resources(dd);
+}
+
+#define HFI_BASE_GUID(dev) \
+ ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Information can be shared between the two HFIs on the same ASIC
+ * in the same OS. This function finds the peer device and sets
+ * up a shared structure.
+ */
+static int init_asic_data(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+ struct hfi1_devdata *tmp, *peer = NULL;
+ struct hfi1_asic_data *asic_data;
+ int ret = 0;
+
+ /* pre-allocate the asic structure in case we are the first device */
+ asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL);
+ if (!asic_data)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ /* Find our peer device */
+ list_for_each_entry(tmp, &hfi1_dev_list, list) {
+ if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+ dd->unit != tmp->unit) {
+ peer = tmp;
+ break;
+ }
+ }
+
+ if (peer) {
+ /* use already allocated structure */
+ dd->asic_data = peer->asic_data;
+ kfree(asic_data);
+ } else {
+ dd->asic_data = asic_data;
+ mutex_init(&dd->asic_data->asic_resource_mutex);
+ }
+ dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+ /* first one through - set up i2c devices */
+ if (!peer)
+ ret = set_up_i2c(dd, dd->asic_data);
+
+ return ret;
+}
+
+/*
+ * Set dd->boardname. Use a generic name if a name is not returned from
+ * EFI variable space.
+ *
+ * Return 0 on success, -ENOMEM if space could not be allocated.
+ */
+static int obtain_boardname(struct hfi1_devdata *dd)
+{
+ /* generic board description */
+ const char generic[] =
+ "Intel Omni-Path Host Fabric Interface Adapter 100 Series";
+ unsigned long size;
+ int ret;
+
+ ret = read_hfi1_efi_var(dd, "description", &size,
+ (void **)&dd->boardname);
+ if (ret) {
+ dd_dev_info(dd, "Board description not found\n");
+ /* use generic description */
+ dd->boardname = kstrdup(generic, GFP_KERNEL);
+ if (!dd->boardname)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/*
+ * Check the interrupt registers to make sure that they are mapped correctly.
+ * It is intended to help user identify any mismapping by VMM when the driver
+ * is running in a VM. This function should only be called before interrupt
+ * is set up properly.
+ *
+ * Return 0 on success, -EINVAL on failure.
+ */
+static int check_int_registers(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ u64 all_bits = ~(u64)0;
+ u64 mask;
+
+ /* Clear CceIntMask[0] to avoid raising any interrupts */
+ mask = read_csr(dd, CCE_INT_MASK);
+ write_csr(dd, CCE_INT_MASK, 0ull);
+ reg = read_csr(dd, CCE_INT_MASK);
+ if (reg)
+ goto err_exit;
+
+ /* Clear all interrupt status bits */
+ write_csr(dd, CCE_INT_CLEAR, all_bits);
+ reg = read_csr(dd, CCE_INT_STATUS);
+ if (reg)
+ goto err_exit;
+
+ /* Set all interrupt status bits */
+ write_csr(dd, CCE_INT_FORCE, all_bits);
+ reg = read_csr(dd, CCE_INT_STATUS);
+ if (reg != all_bits)
+ goto err_exit;
+
+ /* Restore the interrupt mask */
+ write_csr(dd, CCE_INT_CLEAR, all_bits);
+ write_csr(dd, CCE_INT_MASK, mask);
+
+ return 0;
+err_exit:
+ write_csr(dd, CCE_INT_MASK, mask);
+ dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n");
+ return -EINVAL;
+}
+
+/**
+ * Allocate and initialize the device structure for the hfi.
+ * @dev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ u64 reg;
+ int i, ret;
+ static const char * const inames[] = { /* implementation names */
+ "RTL silicon",
+ "RTL VCS simulation",
+ "RTL FPGA emulation",
+ "Functional simulator"
+ };
+ struct pci_dev *parent = pdev->bus->self;
+
+ dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+ sizeof(struct hfi1_pportdata));
+ if (IS_ERR(dd))
+ goto bail;
+ ppd = dd->pport;
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ int vl;
+ /* init common fields */
+ hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+ /* DC supports 4 link widths */
+ ppd->link_width_supported =
+ OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+ OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+ ppd->link_width_downgrade_supported =
+ ppd->link_width_supported;
+ /* start out enabling only 4X */
+ ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+ ppd->link_width_downgrade_enabled =
+ ppd->link_width_downgrade_supported;
+ /* link width active is 0 when link is down */
+ /* link width downgrade active is 0 when link is down */
+
+ if (num_vls < HFI1_MIN_VLS_SUPPORTED ||
+ num_vls > HFI1_MAX_VLS_SUPPORTED) {
+ hfi1_early_err(&pdev->dev,
+ "Invalid num_vls %u, using %u VLs\n",
+ num_vls, HFI1_MAX_VLS_SUPPORTED);
+ num_vls = HFI1_MAX_VLS_SUPPORTED;
+ }
+ ppd->vls_supported = num_vls;
+ ppd->vls_operational = ppd->vls_supported;
+ ppd->actual_vls_operational = ppd->vls_supported;
+ /* Set the default MTU. */
+ for (vl = 0; vl < num_vls; vl++)
+ dd->vld[vl].mtu = hfi1_max_mtu;
+ dd->vld[15].mtu = MAX_MAD_PACKET;
+ /*
+ * Set the initial values to reasonable default, will be set
+ * for real when link is up.
+ */
+ ppd->lstate = IB_PORT_DOWN;
+ ppd->overrun_threshold = 0x4;
+ ppd->phy_error_threshold = 0xf;
+ ppd->port_crc_mode_enabled = link_crc_mask;
+ /* initialize supported LTP CRC mode */
+ ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+ /* initialize enabled LTP CRC mode */
+ ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+ /* start in offline */
+ ppd->host_link_state = HLS_DN_OFFLINE;
+ init_vl_arb_caches(ppd);
+ ppd->last_pstate = 0xff; /* invalid value */
+ }
+
+ dd->link_default = HLS_DN_POLL;
+
+ /*
+ * Do remaining PCIe setup and save PCIe values in dd.
+ * Any error printing is already done by the init code.
+ * On return, we have the chip mapped.
+ */
+ ret = hfi1_pcie_ddinit(dd, pdev, ent);
+ if (ret < 0)
+ goto bail_free;
+
+ /* verify that reads actually work, save revision for reset check */
+ dd->revision = read_csr(dd, CCE_REVISION);
+ if (dd->revision == ~(u64)0) {
+ dd_dev_err(dd, "cannot read chip CSRs\n");
+ ret = -EINVAL;
+ goto bail_cleanup;
+ }
+ dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+ & CCE_REVISION_CHIP_REV_MAJOR_MASK;
+ dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+ & CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+ /*
+ * Check interrupt registers mapping if the driver has no access to
+ * the upstream component. In this case, it is likely that the driver
+ * is running in a VM.
+ */
+ if (!parent) {
+ ret = check_int_registers(dd);
+ if (ret)
+ goto bail_cleanup;
+ }
+
+ /*
+ * obtain the hardware ID - NOT related to unit, which is a
+ * software enumeration
+ */
+ reg = read_csr(dd, CCE_REVISION2);
+ dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+ & CCE_REVISION2_HFI_ID_MASK;
+ /* the variable size will remove unwanted bits */
+ dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+ dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+ dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+ dd->icode < ARRAY_SIZE(inames) ?
+ inames[dd->icode] : "unknown", (int)dd->irev);
+
+ /* speeds the hardware can support */
+ dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+ /* speeds allowed to run at */
+ dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+ /* give a reasonable active value, will be set on link up */
+ dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+ dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+ dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+ dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+ dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+ dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+ /* fix up link widths for emulation _p */
+ ppd = dd->pport;
+ if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+ ppd->link_width_supported =
+ ppd->link_width_enabled =
+ ppd->link_width_downgrade_supported =
+ ppd->link_width_downgrade_enabled =
+ OPA_LINK_WIDTH_1X;
+ }
+ /* insure num_vls isn't larger than number of sdma engines */
+ if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+ dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+ num_vls, dd->chip_sdma_engines);
+ num_vls = dd->chip_sdma_engines;
+ ppd->vls_supported = dd->chip_sdma_engines;
+ ppd->vls_operational = ppd->vls_supported;
+ }
+
+ /*
+ * Convert the ns parameter to the 64 * cclocks used in the CSR.
+ * Limit the max if larger than the field holds. If timeout is
+ * non-zero, then the calculated field will be at least 1.
+ *
+ * Must be after icode is set up - the cclock rate depends
+ * on knowing the hardware being used.
+ */
+ dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+ if (dd->rcv_intr_timeout_csr >
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+ dd->rcv_intr_timeout_csr =
+ RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+ else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+ dd->rcv_intr_timeout_csr = 1;
+
+ /* needs to be done before we look for the peer device */
+ read_guid(dd);
+
+ /* set up shared ASIC data with peer device */
+ ret = init_asic_data(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* obtain chip sizes, reset chip CSRs */
+ init_chip(dd);
+
+ /* read in the PCIe link speed information */
+ ret = pcie_speeds(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* Needs to be called before hfi1_firmware_init */
+ get_platform_config(dd);
+
+ /* read in firmware */
+ ret = hfi1_firmware_init(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /*
+ * In general, the PCIe Gen3 transition must occur after the
+ * chip has been idled (so it won't initiate any PCIe transactions
+ * e.g. an interrupt) and before the driver changes any registers
+ * (the transition will reset the registers).
+ *
+ * In particular, place this call after:
+ * - init_chip() - the chip will not initiate any PCIe transactions
+ * - pcie_speeds() - reads the current link speed
+ * - hfi1_firmware_init() - the needed firmware is ready to be
+ * downloaded
+ */
+ ret = do_pcie_gen3_transition(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* start setting dd values and adjusting CSRs */
+ init_early_variables(dd);
+
+ parse_platform_config(dd);
+
+ ret = obtain_boardname(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ snprintf(dd->boardversion, BOARD_VERS_MAX,
+ "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n",
+ HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+ (u32)dd->majrev,
+ (u32)dd->minrev,
+ (dd->revision >> CCE_REVISION_SW_SHIFT)
+ & CCE_REVISION_SW_MASK);
+
+ ret = set_up_context_variables(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* set initial RXE CSRs */
+ init_rxe(dd);
+ /* set initial TXE CSRs */
+ init_txe(dd);
+ /* set initial non-RXE, non-TXE CSRs */
+ init_other(dd);
+ /* set up KDETH QP prefix in both RX and TX CSRs */
+ init_kdeth_qp(dd);
+
+ ret = hfi1_dev_affinity_init(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* send contexts must be set up before receive contexts */
+ ret = init_send_contexts(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ ret = hfi1_create_ctxts(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+ /*
+ * rcd[0] is guaranteed to be valid by this point. Also, all
+ * context are using the same value, as per the module parameter.
+ */
+ dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+ ret = init_pervl_scs(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* sdma init */
+ for (i = 0; i < dd->num_pports; ++i) {
+ ret = sdma_init(dd, i);
+ if (ret)
+ goto bail_cleanup;
+ }
+
+ /* use contexts created by hfi1_create_ctxts */
+ ret = set_up_interrupts(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ /* set up LCB access - must be after set_up_interrupts() */
+ init_lcb_access(dd);
+
+ /*
+ * Serial number is created from the base guid:
+ * [27:24] = base guid [38:35]
+ * [23: 0] = base guid [23: 0]
+ */
+ snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+ (dd->base_guid & 0xFFFFFF) |
+ ((dd->base_guid >> 11) & 0xF000000));
+
+ dd->oui1 = dd->base_guid >> 56 & 0xFF;
+ dd->oui2 = dd->base_guid >> 48 & 0xFF;
+ dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+ ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+ if (ret)
+ goto bail_clear_intr;
+
+ thermal_init(dd);
+
+ ret = init_cntrs(dd);
+ if (ret)
+ goto bail_clear_intr;
+
+ ret = init_rcverr(dd);
+ if (ret)
+ goto bail_free_cntrs;
+
+ ret = eprom_init(dd);
+ if (ret)
+ goto bail_free_rcverr;
+
+ goto bail;
+
+bail_free_rcverr:
+ free_rcverr(dd);
+bail_free_cntrs:
+ free_cntrs(dd);
+bail_clear_intr:
+ clean_up_interrupts(dd);
+bail_cleanup:
+ hfi1_pcie_ddcleanup(dd);
+bail_free:
+ hfi1_free_devdata(dd);
+ dd = ERR_PTR(ret);
+bail:
+ return dd;
+}
+
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+ u32 dw_len)
+{
+ u32 delta_cycles;
+ u32 current_egress_rate = ppd->current_egress_rate;
+ /* rates here are in units of 10^6 bits/sec */
+
+ if (desired_egress_rate == -1)
+ return 0; /* shouldn't happen */
+
+ if (desired_egress_rate >= current_egress_rate)
+ return 0; /* we can't help go faster, only slower */
+
+ delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
+ egress_cycles(dw_len * 4, current_egress_rate);
+
+ return (u16)delta_cycles;
+}
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @flags: special case flags or-ed in built pbc
+ * @srate: static rate
+ * @vl: vl
+ * @dwlen: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature. The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+ u32 dw_len)
+{
+ u64 pbc, delay = 0;
+
+ if (unlikely(srate_mbs))
+ delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+ pbc = flags
+ | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+ | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+ | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
+ | (dw_len & PBC_LENGTH_DWS_MASK)
+ << PBC_LENGTH_DWS_SHIFT;
+
+ return pbc;
+}
+
+#define SBUS_THERMAL 0x4f
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+#define THERM_FAILURE(dev, ret, reason) \
+ dd_dev_err((dd), \
+ "Thermal sensor initialization failed: %s (%d)\n", \
+ (reason), (ret))
+
+/*
+ * Initialize the thermal sensor.
+ *
+ * After initialization, enable polling of thermal sensor through
+ * SBus interface. In order for this to work, the SBus Master
+ * firmware has to be loaded due to the fact that the HW polling
+ * logic uses SBus interrupts, which are not supported with
+ * default firmware. Otherwise, no data will be returned through
+ * the ASIC_STS_THERM CSR.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+
+ if (dd->icode != ICODE_RTL_SILICON ||
+ check_chip_resource(dd, CR_THERM_INIT, NULL))
+ return ret;
+
+ ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Acquire SBus");
+ return ret;
+ }
+
+ dd_dev_info(dd, "Initializing thermal sensor\n");
+ /* Disable polling of thermal readings */
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+ msleep(100);
+ /* Thermal Sensor Initialization */
+ /* Step 1: Reset the Thermal SBus Receiver */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+ RESET_SBUS_RECEIVER, 0);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Bus Reset");
+ goto done;
+ }
+ /* Step 2: Set Reset bit in Thermal block */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+ WRITE_SBUS_RECEIVER, 0x1);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Therm Block Reset");
+ goto done;
+ }
+ /* Step 3: Write clock divider value (100MHz -> 2MHz) */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
+ WRITE_SBUS_RECEIVER, 0x32);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Write Clock Div");
+ goto done;
+ }
+ /* Step 4: Select temperature mode */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
+ WRITE_SBUS_RECEIVER,
+ SBUS_THERM_MONITOR_MODE);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Write Mode Sel");
+ goto done;
+ }
+ /* Step 5: De-assert block reset and start conversion */
+ ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+ WRITE_SBUS_RECEIVER, 0x2);
+ if (ret) {
+ THERM_FAILURE(dd, ret, "Write Reset Deassert");
+ goto done;
+ }
+ /* Step 5.1: Wait for first conversion (21.5ms per spec) */
+ msleep(22);
+
+ /* Enable polling of thermal readings */
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+
+ /* Set initialized flag */
+ ret = acquire_chip_resource(dd, CR_THERM_INIT, 0);
+ if (ret)
+ THERM_FAILURE(dd, ret, "Unable to set thermal init flag");
+
+done:
+ release_chip_resource(dd, CR_SBUS);
+ return ret;
+}
+
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd = &dd->pport[0];
+ /*
+ * Thermal Critical Interrupt
+ * Put the device into forced freeze mode, take link down to
+ * offline, and put DC into reset.
+ */
+ dd_dev_emerg(dd,
+ "Critical temperature reached! Forcing device into freeze mode!\n");
+ dd->flags |= HFI1_FORCED_FREEZE;
+ start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT);
+ /*
+ * Shut DC down as much and as quickly as possible.
+ *
+ * Step 1: Take the link down to OFFLINE. This will cause the
+ * 8051 to put the Serdes in reset. However, we don't want to
+ * go through the entire link state machine since we want to
+ * shutdown ASAP. Furthermore, this is not a graceful shutdown
+ * but rather an attempt to save the chip.
+ * Code below is almost the same as quiet_serdes() but avoids
+ * all the extra work and the sleeps.
+ */
+ ppd->driver_link_ready = 0;
+ ppd->link_enabled = 0;
+ set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) |
+ PLS_OFFLINE);
+ /*
+ * Step 2: Shutdown LCB and 8051
+ * After shutdown, do not restore DC_CFG_RESET value.
+ */
+ dc_shutdown(dd);
+}
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
new file mode 100644
index 000000000000..e29573769efc
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -0,0 +1,1372 @@
+#ifndef _CHIP_H
+#define _CHIP_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the defines that is specific to the HFI chip
+ */
+
+/* sizes */
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define NUM_INTERRUPT_SOURCES 768
+#define RXE_NUM_CONTEXTS 160
+#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_NUM_DATA_VL 8
+#define TXE_NUM_CONTEXTS 160
+#define TXE_NUM_SDMA_ENGINES 16
+#define NUM_CONTEXTS_PER_SET 8
+#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
+#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
+#define VL_ARB_TABLE_SIZE 16
+#define TXE_NUM_32_BIT_COUNTER 7
+#define TXE_NUM_64_BIT_COUNTER 30
+#define TXE_NUM_DATA_VL 8
+#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */
+#define PIO_BLOCK_SIZE 64 /* bytes */
+#define SDMA_BLOCK_SIZE 64 /* bytes */
+#define RCV_BUF_BLOCK_SIZE 64 /* bytes */
+#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */
+#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */
+#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */
+/*
+ * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed
+ * at 64 bytes for all generation one devices
+ */
+#define CM_VAU 3
+/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
+#define CM_GLOBAL_CREDITS 0x940
+/* Number of PKey entries in the HW */
+#define MAX_PKEY_VALUES 16
+
+#include "chip_registers.h"
+
+#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET)
+#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
+
+/* PBC flags */
+#define PBC_INTR BIT_ULL(31)
+#define PBC_DC_INFO_SHIFT (30)
+#define PBC_DC_INFO BIT_ULL(PBC_DC_INFO_SHIFT)
+#define PBC_TEST_EBP BIT_ULL(29)
+#define PBC_PACKET_BYPASS BIT_ULL(28)
+#define PBC_CREDIT_RETURN BIT_ULL(25)
+#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24)
+#define PBC_TEST_BAD_ICRC BIT_ULL(23)
+#define PBC_FECN BIT_ULL(22)
+
+/* PbcInsertHcrc field settings */
+#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */
+#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */
+#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */
+
+/* PBC fields */
+#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
+#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
+#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
+ (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
+ PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+
+#define PBC_INSERT_HCRC_SHIFT 26
+#define PBC_INSERT_HCRC_MASK 0x3ull
+#define PBC_INSERT_HCRC_SMASK \
+ (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
+
+#define PBC_VL_SHIFT 12
+#define PBC_VL_MASK 0xfull
+#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
+
+#define PBC_LENGTH_DWS_SHIFT 0
+#define PBC_LENGTH_DWS_MASK 0xfffull
+#define PBC_LENGTH_DWS_SMASK \
+ (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
+
+/* Credit Return Fields */
+#define CR_COUNTER_SHIFT 0
+#define CR_COUNTER_MASK 0x7ffull
+#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
+
+#define CR_STATUS_SHIFT 11
+#define CR_STATUS_MASK 0x1ull
+#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
+#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
+#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
+ (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
+ CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
+
+/* interrupt source numbers */
+#define IS_GENERAL_ERR_START 0
+#define IS_SDMAENG_ERR_START 16
+#define IS_SENDCTXT_ERR_START 32
+#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */
+#define IS_VARIOUS_START 240
+#define IS_DC_START 248
+#define IS_RCVAVAIL_START 256
+#define IS_RCVURGENT_START 416
+#define IS_SENDCREDIT_START 576
+#define IS_RESERVED_START 736
+#define IS_MAX_SOURCES 768
+
+/* derived interrupt source values */
+#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START
+#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START
+#define IS_SENDCTXT_ERR_END IS_SDMA_START
+#define IS_SDMA_END IS_VARIOUS_START
+#define IS_VARIOUS_END IS_DC_START
+#define IS_DC_END IS_RCVAVAIL_START
+#define IS_RCVAVAIL_END IS_RCVURGENT_START
+#define IS_RCVURGENT_END IS_SENDCREDIT_START
+#define IS_SENDCREDIT_END IS_RESERVED_START
+#define IS_RESERVED_END IS_MAX_SOURCES
+
+/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
+#define QSFP1_INT 242
+#define QSFP2_INT 243
+
+/* DCC_CFG_PORT_CONFIG logical link states */
+#define LSTATE_DOWN 0x1
+#define LSTATE_INIT 0x2
+#define LSTATE_ARMED 0x3
+#define LSTATE_ACTIVE 0x4
+
+/* DC8051_STS_CUR_STATE port values (physical link states) */
+#define PLS_DISABLED 0x30
+#define PLS_OFFLINE 0x90
+#define PLS_OFFLINE_QUIET 0x90
+#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91
+#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92
+#define PLS_OFFLINE_REPORT_FAILURE 0x93
+#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94
+#define PLS_POLLING 0x20
+#define PLS_POLLING_QUIET 0x20
+#define PLS_POLLING_ACTIVE 0x21
+#define PLS_CONFIGPHY 0x40
+#define PLS_CONFIGPHY_DEBOUCE 0x40
+#define PLS_CONFIGPHY_ESTCOMM 0x41
+#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42
+#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE 0x43
+#define PLS_CONFIGPHY_OPTEQ 0x44
+#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44
+#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45
+#define PLS_CONFIGPHY_VERIFYCAP 0x46
+#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46
+#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
+#define PLS_CONFIGLT 0x48
+#define PLS_CONFIGLT_CONFIGURE 0x48
+#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49
+#define PLS_LINKUP 0x50
+#define PLS_PHYTEST 0xB0
+#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1
+#define PLS_QUICK_LINKUP 0xe2
+
+/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
+#define HCMD_LOAD_CONFIG_DATA 0x01
+#define HCMD_READ_CONFIG_DATA 0x02
+#define HCMD_CHANGE_PHY_STATE 0x03
+#define HCMD_SEND_LCB_IDLE_MSG 0x04
+#define HCMD_MISC 0x05
+#define HCMD_READ_LCB_IDLE_MSG 0x06
+#define HCMD_READ_LCB_CSR 0x07
+#define HCMD_WRITE_LCB_CSR 0x08
+#define HCMD_INTERFACE_TEST 0xff
+
+/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
+#define HCMD_SUCCESS 2
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
+#define SPICO_ROM_FAILED BIT(0)
+#define UNKNOWN_FRAME BIT(1)
+#define TARGET_BER_NOT_MET BIT(2)
+#define FAILED_SERDES_INTERNAL_LOOPBACK BIT(3)
+#define FAILED_SERDES_INIT BIT(4)
+#define FAILED_LNI_POLLING BIT(5)
+#define FAILED_LNI_DEBOUNCE BIT(6)
+#define FAILED_LNI_ESTBCOMM BIT(7)
+#define FAILED_LNI_OPTEQ BIT(8)
+#define FAILED_LNI_VERIFY_CAP1 BIT(9)
+#define FAILED_LNI_VERIFY_CAP2 BIT(10)
+#define FAILED_LNI_CONFIGLT BIT(11)
+#define HOST_HANDSHAKE_TIMEOUT BIT(12)
+
+#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
+ | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
+ | FAILED_LNI_VERIFY_CAP1 \
+ | FAILED_LNI_VERIFY_CAP2 \
+ | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT)
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
+#define HOST_REQ_DONE BIT(0)
+#define BC_PWR_MGM_MSG BIT(1)
+#define BC_SMA_MSG BIT(2)
+#define BC_BCC_UNKNOWN_MSG BIT(3)
+#define BC_IDLE_UNKNOWN_MSG BIT(4)
+#define EXT_DEVICE_CFG_REQ BIT(5)
+#define VERIFY_CAP_FRAME BIT(6)
+#define LINKUP_ACHIEVED BIT(7)
+#define LINK_GOING_DOWN BIT(8)
+#define LINK_WIDTH_DOWNGRADED BIT(9)
+
+/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
+#define HREQ_LOAD_CONFIG 0x01
+#define HREQ_SAVE_CONFIG 0x02
+#define HREQ_READ_CONFIG 0x03
+#define HREQ_SET_TX_EQ_ABS 0x04
+#define HREQ_SET_TX_EQ_REL 0x05
+#define HREQ_ENABLE 0x06
+#define HREQ_CONFIG_DONE 0xfe
+#define HREQ_INTERFACE_TEST 0xff
+
+/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
+#define HREQ_INVALID 0x01
+#define HREQ_SUCCESS 0x02
+#define HREQ_NOT_SUPPORTED 0x03
+#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */
+#define HREQ_REQUEST_REJECTED 0xfe
+#define HREQ_EXECUTION_ONGOING 0xff
+
+/* MISC host command functions */
+#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
+#define HCMD_MISC_GRANT_LCB_ACCESS 0x2
+
+/* idle flit message types */
+#define IDLE_PHYSICAL_LINK_MGMT 0x1
+#define IDLE_CRU 0x2
+#define IDLE_SMA 0x3
+#define IDLE_POWER_MGMT 0x4
+
+/* idle flit message send fields (both send and read) */
+#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
+#define IDLE_PAYLOAD_SHIFT 8
+#define IDLE_MSG_TYPE_MASK 0xf
+#define IDLE_MSG_TYPE_SHIFT 0
+
+/* idle flit message read fields */
+#define READ_IDLE_MSG_TYPE_MASK 0xf
+#define READ_IDLE_MSG_TYPE_SHIFT 0
+
+/* SMA idle flit payload commands */
+#define SMA_IDLE_ARM 1
+#define SMA_IDLE_ACTIVE 2
+
+/* DC_DC8051_CFG_MODE.GENERAL bits */
+#define DISABLE_SELF_GUID_CHECK 0x2
+
+/*
+ * Eager buffer minimum and maximum sizes supported by the hardware.
+ * All power-of-two sizes in between are supported as well.
+ * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
+ * allocatable for Eager buffer to a single context. All others
+ * are limits for the RcvArray entries.
+ */
+#define MIN_EAGER_BUFFER (4 * 1024)
+#define MAX_EAGER_BUFFER (256 * 1024)
+#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
+#define MAX_EXPECTED_BUFFER (2048 * 1024)
+
+/*
+ * Receive expected base and count and eager base and count increment -
+ * the CSR fields hold multiples of this value.
+ */
+#define RCV_SHIFT 3
+#define RCV_INCREMENT BIT(RCV_SHIFT)
+
+/*
+ * Receive header queue entry increment - the CSR holds multiples of
+ * this value.
+ */
+#define HDRQ_SIZE_SHIFT 5
+#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT)
+
+/*
+ * Freeze handling flags
+ */
+#define FREEZE_ABORT 0x01 /* do not do recovery */
+#define FREEZE_SELF 0x02 /* initiate the freeze */
+#define FREEZE_LINK_DOWN 0x04 /* link is down */
+
+/*
+ * Chip implementation codes.
+ */
+#define ICODE_RTL_SILICON 0x00
+#define ICODE_RTL_VCS_SIMULATION 0x01
+#define ICODE_FPGA_EMULATION 0x02
+#define ICODE_FUNCTIONAL_SIMULATOR 0x03
+
+/*
+ * 8051 data memory size.
+ */
+#define DC8051_DATA_MEM_SIZE 0x1000
+
+/*
+ * 8051 firmware registers
+ */
+#define NUM_GENERAL_FIELDS 0x17
+#define NUM_LANE_FIELDS 0x8
+
+/* 8051 general register Field IDs */
+#define LINK_OPTIMIZATION_SETTINGS 0x00
+#define LINK_TUNING_PARAMETERS 0x02
+#define DC_HOST_COMM_SETTINGS 0x03
+#define TX_SETTINGS 0x06
+#define VERIFY_CAP_LOCAL_PHY 0x07
+#define VERIFY_CAP_LOCAL_FABRIC 0x08
+#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09
+#define LOCAL_DEVICE_ID 0x0a
+#define LOCAL_LNI_INFO 0x0c
+#define REMOTE_LNI_INFO 0x0d
+#define MISC_STATUS 0x0e
+#define VERIFY_CAP_REMOTE_PHY 0x0f
+#define VERIFY_CAP_REMOTE_FABRIC 0x10
+#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
+#define LAST_LOCAL_STATE_COMPLETE 0x12
+#define LAST_REMOTE_STATE_COMPLETE 0x13
+#define LINK_QUALITY_INFO 0x14
+#define REMOTE_DEVICE_ID 0x15
+#define LINK_DOWN_REASON 0x16
+
+/* 8051 lane specific register field IDs */
+#define TX_EQ_SETTINGS 0x00
+#define CHANNEL_LOSS_SETTINGS 0x05
+
+/* Lane ID for general configuration registers */
+#define GENERAL_CONFIG 4
+
+/* LINK_TUNING_PARAMETERS fields */
+#define TUNING_METHOD_SHIFT 24
+
+/* LINK_OPTIMIZATION_SETTINGS fields */
+#define ENABLE_EXT_DEV_CONFIG_SHIFT 24
+
+/* LOAD_DATA 8051 command shifts and fields */
+#define LOAD_DATA_FIELD_ID_SHIFT 40
+#define LOAD_DATA_FIELD_ID_MASK 0xfull
+#define LOAD_DATA_LANE_ID_SHIFT 32
+#define LOAD_DATA_LANE_ID_MASK 0xfull
+#define LOAD_DATA_DATA_SHIFT 0x0
+#define LOAD_DATA_DATA_MASK 0xffffffffull
+
+/* READ_DATA 8051 command shifts and fields */
+#define READ_DATA_FIELD_ID_SHIFT 40
+#define READ_DATA_FIELD_ID_MASK 0xffull
+#define READ_DATA_LANE_ID_SHIFT 32
+#define READ_DATA_LANE_ID_MASK 0xffull
+#define READ_DATA_DATA_SHIFT 0x0
+#define READ_DATA_DATA_MASK 0xffffffffull
+
+/* TX settings fields */
+#define ENABLE_LANE_TX_SHIFT 0
+#define ENABLE_LANE_TX_MASK 0xff
+#define TX_POLARITY_INVERSION_SHIFT 8
+#define TX_POLARITY_INVERSION_MASK 0xff
+#define RX_POLARITY_INVERSION_SHIFT 16
+#define RX_POLARITY_INVERSION_MASK 0xff
+#define MAX_RATE_SHIFT 24
+#define MAX_RATE_MASK 0xff
+
+/* verify capability PHY fields */
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1
+#define POWER_MANAGEMENT_SHIFT 0x0
+#define POWER_MANAGEMENT_MASK 0xf
+
+/* 8051 lane register Field IDs */
+#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */
+
+/* SPICO firmware version fields */
+#define SPICO_ROM_VERSION_SHIFT 0
+#define SPICO_ROM_VERSION_MASK 0xffff
+#define SPICO_ROM_PROD_ID_SHIFT 16
+#define SPICO_ROM_PROD_ID_MASK 0xffff
+
+/* verify capability fabric fields */
+#define VAU_SHIFT 0
+#define VAU_MASK 0x0007
+#define Z_SHIFT 3
+#define Z_MASK 0x0001
+#define VCU_SHIFT 4
+#define VCU_MASK 0x0007
+#define VL15BUF_SHIFT 8
+#define VL15BUF_MASK 0x0fff
+#define CRC_SIZES_SHIFT 20
+#define CRC_SIZES_MASK 0x7
+
+/* verify capability local link width fields */
+#define LINK_WIDTH_SHIFT 0 /* also for remote link width */
+#define LINK_WIDTH_MASK 0xffff /* also for remote link width */
+#define LOCAL_FLAG_BITS_SHIFT 16
+#define LOCAL_FLAG_BITS_MASK 0xff
+#define MISC_CONFIG_BITS_SHIFT 24
+#define MISC_CONFIG_BITS_MASK 0xff
+
+/* verify capability remote link width fields */
+#define REMOTE_TX_RATE_SHIFT 16
+#define REMOTE_TX_RATE_MASK 0xff
+
+/* LOCAL_DEVICE_ID fields */
+#define LOCAL_DEVICE_REV_SHIFT 0
+#define LOCAL_DEVICE_REV_MASK 0xff
+#define LOCAL_DEVICE_ID_SHIFT 8
+#define LOCAL_DEVICE_ID_MASK 0xffff
+
+/* REMOTE_DEVICE_ID fields */
+#define REMOTE_DEVICE_REV_SHIFT 0
+#define REMOTE_DEVICE_REV_MASK 0xff
+#define REMOTE_DEVICE_ID_SHIFT 8
+#define REMOTE_DEVICE_ID_MASK 0xffff
+
+/* local LNI link width fields */
+#define ENABLE_LANE_RX_SHIFT 16
+#define ENABLE_LANE_RX_MASK 0xff
+
+/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
+#define MGMT_ALLOWED_SHIFT 23
+#define MGMT_ALLOWED_MASK 0x1
+
+/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
+#define LINK_QUALITY_SHIFT 24
+#define LINK_QUALITY_MASK 0x7
+
+/*
+ * mask, shift for reading 'planned_down_remote_reason_code'
+ * from LINK_QUALITY_INFO field
+ */
+#define DOWN_REMOTE_REASON_SHIFT 16
+#define DOWN_REMOTE_REASON_MASK 0xff
+
+/* verify capability PHY power management bits */
+#define PWRM_BER_CONTROL 0x1
+#define PWRM_BANDWIDTH_CONTROL 0x2
+
+/* 8051 link down reasons */
+#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa
+#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb
+#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc
+
+/* verify capability fabric CRC size bits */
+enum {
+ CAP_CRC_14B = (1 << 0), /* 14b CRC */
+ CAP_CRC_48B = (1 << 1), /* 48b CRC */
+ CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
+};
+
+#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
+
+/* misc status version fields */
+#define STS_FM_VERSION_A_SHIFT 16
+#define STS_FM_VERSION_A_MASK 0xff
+#define STS_FM_VERSION_B_SHIFT 24
+#define STS_FM_VERSION_B_MASK 0xff
+
+/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
+#define LCB_CRC_16B 0x0 /* 16b CRC */
+#define LCB_CRC_14B 0x1 /* 14b CRC */
+#define LCB_CRC_48B 0x2 /* 48b CRC */
+#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */
+
+/*
+ * the following enum is (almost) a copy/paste of the definition
+ * in the OPA spec, section 20.2.2.6.8 (PortInfo)
+ */
+enum {
+ PORT_LTP_CRC_MODE_NONE = 0,
+ PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
+ PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
+ PORT_LTP_CRC_MODE_48 = 4,
+ /* 48-bit overlapping LTP CRC mode (optional) */
+ PORT_LTP_CRC_MODE_PER_LANE = 8
+ /* 12 to 16 bit per lane LTP CRC mode (optional) */
+};
+
+/* timeouts */
+#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */
+#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */
+#define DC8051_COMMAND_TIMEOUT 20000 /* DC8051 command timeout, in ms */
+#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */
+#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */
+#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */
+
+/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */
+#define ASIC_CCLOCK_PS 1242 /* 805 MHz */
+#define FPGA_CCLOCK_PS 30300 /* 33 MHz */
+
+/*
+ * Mask of enabled MISC errors. Do not enable the two RSA engine errors -
+ * see firmware.c:run_rsa() for details.
+ */
+#define DRIVER_MISC_MASK \
+ (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
+ | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
+
+/* valid values for the loopback module parameter */
+#define LOOPBACK_NONE 0 /* no loopback - default */
+#define LOOPBACK_SERDES 1
+#define LOOPBACK_LCB 2
+#define LOOPBACK_CABLE 3 /* external cable */
+
+/* read and write hardware registers */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
+
+/*
+ * The *_kctxt_* flavor of the CSR read/write functions are for
+ * per-context or per-SDMA CSRs that are not mappable to user-space.
+ * Their spacing is not a PAGE_SIZE multiple.
+ */
+static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+ u32 offset0)
+{
+ /* kernel per-context CSRs are separated by 0x100 */
+ return read_csr(dd, offset0 + (0x100 * ctxt));
+}
+
+static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
+ u32 offset0, u64 value)
+{
+ /* kernel per-context CSRs are separated by 0x100 */
+ write_csr(dd, offset0 + (0x100 * ctxt), value);
+}
+
+int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
+int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
+
+void __iomem *get_csr_addr(
+ struct hfi1_devdata *dd,
+ u32 offset);
+
+static inline void __iomem *get_kctxt_csr_addr(
+ struct hfi1_devdata *dd,
+ int ctxt,
+ u32 offset0)
+{
+ return get_csr_addr(dd, offset0 + (0x100 * ctxt));
+}
+
+/*
+ * The *_uctxt_* flavor of the CSR read/write functions are for
+ * per-context CSRs that are mappable to user space. All these CSRs
+ * are spaced by a PAGE_SIZE multiple in order to be mappable to
+ * different processes without exposing other contexts' CSRs
+ */
+static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+ u32 offset0)
+{
+ /* user per-context CSRs are separated by 0x1000 */
+ return read_csr(dd, offset0 + (0x1000 * ctxt));
+}
+
+static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
+ u32 offset0, u64 value)
+{
+ /* user per-context CSRs are separated by 0x1000 */
+ write_csr(dd, offset0 + (0x1000 * ctxt), value);
+}
+
+u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
+
+/* firmware.c */
+#define SBUS_MASTER_BROADCAST 0xfd
+#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */
+extern const u8 pcie_serdes_broadcast[];
+extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
+extern uint platform_config_load;
+
+/* SBus commands */
+#define RESET_SBUS_RECEIVER 0x20
+#define WRITE_SBUS_RECEIVER 0x21
+#define READ_SBUS_RECEIVER 0x22
+void sbus_request(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+int sbus_request_slow(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+void set_sbus_fast_mode(struct hfi1_devdata *dd);
+void clear_sbus_fast_mode(struct hfi1_devdata *dd);
+int hfi1_firmware_init(struct hfi1_devdata *dd);
+int load_pcie_firmware(struct hfi1_devdata *dd);
+int load_firmware(struct hfi1_devdata *dd);
+void dispose_firmware(void);
+int acquire_hw_mutex(struct hfi1_devdata *dd);
+void release_hw_mutex(struct hfi1_devdata *dd);
+
+/*
+ * Bitmask of dynamic access for ASIC block chip resources. Each HFI has its
+ * own range of bits for the resource so it can clear its own bits on
+ * starting and exiting. If either HFI has the resource bit set, the
+ * resource is in use. The separate bit ranges are:
+ * HFI0 bits 7:0
+ * HFI1 bits 15:8
+ */
+#define CR_SBUS 0x01 /* SBUS, THERM, and PCIE registers */
+#define CR_EPROM 0x02 /* EEP, GPIO registers */
+#define CR_I2C1 0x04 /* QSFP1_OE register */
+#define CR_I2C2 0x08 /* QSFP2_OE register */
+#define CR_DYN_SHIFT 8 /* dynamic flag shift */
+#define CR_DYN_MASK ((1ull << CR_DYN_SHIFT) - 1)
+
+/*
+ * Bitmask of static ASIC states these are outside of the dynamic ASIC
+ * block chip resources above. These are to be set once and never cleared.
+ * Must be holding the SBus dynamic flag when setting.
+ */
+#define CR_THERM_INIT 0x010000
+
+int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait);
+void release_chip_resource(struct hfi1_devdata *dd, u32 resource);
+bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
+ const char *func);
+void init_chip_resources(struct hfi1_devdata *dd);
+void finish_chip_resources(struct hfi1_devdata *dd);
+
+/* ms wait time for access to an SBus resoure */
+#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */
+
+/* ms wait time for a qsfp (i2c) chain to become available */
+#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */
+
+void fabric_serdes_reset(struct hfi1_devdata *dd);
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
+
+/* chip.c */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
+void read_guid(struct hfi1_devdata *dd);
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+ u8 neigh_reason, u8 rem_reason);
+int set_link_state(struct hfi1_pportdata *, u32 state);
+int port_ltp_to_cap(int port_ltp);
+void handle_verify_cap(struct work_struct *work);
+void handle_freeze(struct work_struct *work);
+void handle_link_up(struct work_struct *work);
+void handle_link_down(struct work_struct *work);
+void handle_link_downgrade(struct work_struct *work);
+void handle_link_bounce(struct work_struct *work);
+void handle_start_link(struct work_struct *work);
+void handle_sma_message(struct work_struct *work);
+void reset_qsfp(struct hfi1_pportdata *ppd);
+void qsfp_event(struct work_struct *work);
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
+int send_idle_sma(struct hfi1_devdata *dd, u64 message);
+int load_8051_config(struct hfi1_devdata *, u8, u8, u32);
+int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *);
+int start_link(struct hfi1_pportdata *ppd);
+int bringup_serdes(struct hfi1_pportdata *ppd);
+void set_intr_state(struct hfi1_devdata *dd, u32 enable);
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
+ int refresh_widths);
+void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
+int stop_drain_data_vls(struct hfi1_devdata *dd);
+int open_fill_data_vls(struct hfi1_devdata *dd);
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
+void get_linkup_link_widths(struct hfi1_pportdata *ppd);
+void read_ltp_rtt(struct hfi1_devdata *dd);
+void clear_linkup_counters(struct hfi1_devdata *dd);
+u32 hdrqempty(struct hfi1_ctxtdata *rcd);
+int is_ax(struct hfi1_devdata *dd);
+int is_bx(struct hfi1_devdata *dd);
+u32 read_physical_state(struct hfi1_devdata *dd);
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
+u32 get_logical_state(struct hfi1_pportdata *ppd);
+const char *opa_lstate_name(u32 lstate);
+const char *opa_pstate_name(u32 pstate);
+u32 driver_physical_state(struct hfi1_pportdata *ppd);
+u32 driver_logical_state(struct hfi1_pportdata *ppd);
+
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+#define LCB_START DC_LCB_CSRS
+#define LCB_END DC_8051_CSRS /* next block is 8051 */
+static inline int is_lcb_offset(u32 offset)
+{
+ return (offset >= LCB_START && offset < LCB_END);
+}
+
+extern uint num_vls;
+
+extern uint disable_integrity;
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
+u32 read_logical_state(struct hfi1_devdata *dd);
+void force_recv_intr(struct hfi1_ctxtdata *rcd);
+
+/* Per VL indexes */
+enum {
+ C_VL_0 = 0,
+ C_VL_1,
+ C_VL_2,
+ C_VL_3,
+ C_VL_4,
+ C_VL_5,
+ C_VL_6,
+ C_VL_7,
+ C_VL_15,
+ C_VL_COUNT
+};
+
+static inline int vl_from_idx(int idx)
+{
+ return (idx == C_VL_15 ? 15 : idx);
+}
+
+static inline int idx_from_vl(int vl)
+{
+ return (vl == 15 ? C_VL_15 : vl);
+}
+
+/* Per device counter indexes */
+enum {
+ C_RCV_OVF = 0,
+ C_RX_TID_FULL,
+ C_RX_TID_INVALID,
+ C_RX_TID_FLGMS,
+ C_RX_CTX_EGRS,
+ C_RCV_TID_FLSMS,
+ C_CCE_PCI_CR_ST,
+ C_CCE_PCI_TR_ST,
+ C_CCE_PIO_WR_ST,
+ C_CCE_ERR_INT,
+ C_CCE_SDMA_INT,
+ C_CCE_MISC_INT,
+ C_CCE_RCV_AV_INT,
+ C_CCE_RCV_URG_INT,
+ C_CCE_SEND_CR_INT,
+ C_DC_UNC_ERR,
+ C_DC_RCV_ERR,
+ C_DC_FM_CFG_ERR,
+ C_DC_RMT_PHY_ERR,
+ C_DC_DROPPED_PKT,
+ C_DC_MC_XMIT_PKTS,
+ C_DC_MC_RCV_PKTS,
+ C_DC_XMIT_CERR,
+ C_DC_RCV_CERR,
+ C_DC_RCV_FCC,
+ C_DC_XMIT_FCC,
+ C_DC_XMIT_FLITS,
+ C_DC_RCV_FLITS,
+ C_DC_XMIT_PKTS,
+ C_DC_RCV_PKTS,
+ C_DC_RX_FLIT_VL,
+ C_DC_RX_PKT_VL,
+ C_DC_RCV_FCN,
+ C_DC_RCV_FCN_VL,
+ C_DC_RCV_BCN,
+ C_DC_RCV_BCN_VL,
+ C_DC_RCV_BBL,
+ C_DC_RCV_BBL_VL,
+ C_DC_MARK_FECN,
+ C_DC_MARK_FECN_VL,
+ C_DC_TOTAL_CRC,
+ C_DC_CRC_LN0,
+ C_DC_CRC_LN1,
+ C_DC_CRC_LN2,
+ C_DC_CRC_LN3,
+ C_DC_CRC_MULT_LN,
+ C_DC_TX_REPLAY,
+ C_DC_RX_REPLAY,
+ C_DC_SEQ_CRC_CNT,
+ C_DC_ESC0_ONLY_CNT,
+ C_DC_ESC0_PLUS1_CNT,
+ C_DC_ESC0_PLUS2_CNT,
+ C_DC_REINIT_FROM_PEER_CNT,
+ C_DC_SBE_CNT,
+ C_DC_MISC_FLG_CNT,
+ C_DC_PRF_GOOD_LTP_CNT,
+ C_DC_PRF_ACCEPTED_LTP_CNT,
+ C_DC_PRF_RX_FLIT_CNT,
+ C_DC_PRF_TX_FLIT_CNT,
+ C_DC_PRF_CLK_CNTR,
+ C_DC_PG_DBG_FLIT_CRDTS_CNT,
+ C_DC_PG_STS_PAUSE_COMPLETE_CNT,
+ C_DC_PG_STS_TX_SBE_CNT,
+ C_DC_PG_STS_TX_MBE_CNT,
+ C_SW_CPU_INTR,
+ C_SW_CPU_RCV_LIM,
+ C_SW_VTX_WAIT,
+ C_SW_PIO_WAIT,
+ C_SW_PIO_DRAIN,
+ C_SW_KMEM_WAIT,
+ C_SW_SEND_SCHED,
+ C_SDMA_DESC_FETCHED_CNT,
+ C_SDMA_INT_CNT,
+ C_SDMA_ERR_CNT,
+ C_SDMA_IDLE_INT_CNT,
+ C_SDMA_PROGRESS_INT_CNT,
+/* MISC_ERR_STATUS */
+ C_MISC_PLL_LOCK_FAIL_ERR,
+ C_MISC_MBIST_FAIL_ERR,
+ C_MISC_INVALID_EEP_CMD_ERR,
+ C_MISC_EFUSE_DONE_PARITY_ERR,
+ C_MISC_EFUSE_WRITE_ERR,
+ C_MISC_EFUSE_READ_BAD_ADDR_ERR,
+ C_MISC_EFUSE_CSR_PARITY_ERR,
+ C_MISC_FW_AUTH_FAILED_ERR,
+ C_MISC_KEY_MISMATCH_ERR,
+ C_MISC_SBUS_WRITE_FAILED_ERR,
+ C_MISC_CSR_WRITE_BAD_ADDR_ERR,
+ C_MISC_CSR_READ_BAD_ADDR_ERR,
+ C_MISC_CSR_PARITY_ERR,
+/* CceErrStatus */
+ /*
+ * A special counter that is the aggregate count
+ * of all the cce_err_status errors. The remainder
+ * are actual bits in the CceErrStatus register.
+ */
+ C_CCE_ERR_STATUS_AGGREGATED_CNT,
+ C_CCE_MSIX_CSR_PARITY_ERR,
+ C_CCE_INT_MAP_UNC_ERR,
+ C_CCE_INT_MAP_COR_ERR,
+ C_CCE_MSIX_TABLE_UNC_ERR,
+ C_CCE_MSIX_TABLE_COR_ERR,
+ C_CCE_RXDMA_CONV_FIFO_PARITY_ERR,
+ C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR,
+ C_CCE_SEG_WRITE_BAD_ADDR_ERR,
+ C_CCE_SEG_READ_BAD_ADDR_ERR,
+ C_LA_TRIGGERED,
+ C_CCE_TRGT_CPL_TIMEOUT_ERR,
+ C_PCIC_RECEIVE_PARITY_ERR,
+ C_PCIC_TRANSMIT_BACK_PARITY_ERR,
+ C_PCIC_TRANSMIT_FRONT_PARITY_ERR,
+ C_PCIC_CPL_DAT_Q_UNC_ERR,
+ C_PCIC_CPL_HD_Q_UNC_ERR,
+ C_PCIC_POST_DAT_Q_UNC_ERR,
+ C_PCIC_POST_HD_Q_UNC_ERR,
+ C_PCIC_RETRY_SOT_MEM_UNC_ERR,
+ C_PCIC_RETRY_MEM_UNC_ERR,
+ C_PCIC_N_POST_DAT_Q_PARITY_ERR,
+ C_PCIC_N_POST_H_Q_PARITY_ERR,
+ C_PCIC_CPL_DAT_Q_COR_ERR,
+ C_PCIC_CPL_HD_Q_COR_ERR,
+ C_PCIC_POST_DAT_Q_COR_ERR,
+ C_PCIC_POST_HD_Q_COR_ERR,
+ C_PCIC_RETRY_SOT_MEM_COR_ERR,
+ C_PCIC_RETRY_MEM_COR_ERR,
+ C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR,
+ C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR,
+ C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR,
+ C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR,
+ C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR,
+ C_CCE_CSR_CFG_BUS_PARITY_ERR,
+ C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR,
+ C_CCE_RSPD_DATA_PARITY_ERR,
+ C_CCE_TRGT_ACCESS_ERR,
+ C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR,
+ C_CCE_CSR_WRITE_BAD_ADDR_ERR,
+ C_CCE_CSR_READ_BAD_ADDR_ERR,
+ C_CCE_CSR_PARITY_ERR,
+/* RcvErrStatus */
+ C_RX_CSR_PARITY_ERR,
+ C_RX_CSR_WRITE_BAD_ADDR_ERR,
+ C_RX_CSR_READ_BAD_ADDR_ERR,
+ C_RX_DMA_CSR_UNC_ERR,
+ C_RX_DMA_DQ_FSM_ENCODING_ERR,
+ C_RX_DMA_EQ_FSM_ENCODING_ERR,
+ C_RX_DMA_CSR_PARITY_ERR,
+ C_RX_RBUF_DATA_COR_ERR,
+ C_RX_RBUF_DATA_UNC_ERR,
+ C_RX_DMA_DATA_FIFO_RD_COR_ERR,
+ C_RX_DMA_DATA_FIFO_RD_UNC_ERR,
+ C_RX_DMA_HDR_FIFO_RD_COR_ERR,
+ C_RX_DMA_HDR_FIFO_RD_UNC_ERR,
+ C_RX_RBUF_DESC_PART2_COR_ERR,
+ C_RX_RBUF_DESC_PART2_UNC_ERR,
+ C_RX_RBUF_DESC_PART1_COR_ERR,
+ C_RX_RBUF_DESC_PART1_UNC_ERR,
+ C_RX_HQ_INTR_FSM_ERR,
+ C_RX_HQ_INTR_CSR_PARITY_ERR,
+ C_RX_LOOKUP_CSR_PARITY_ERR,
+ C_RX_LOOKUP_RCV_ARRAY_COR_ERR,
+ C_RX_LOOKUP_RCV_ARRAY_UNC_ERR,
+ C_RX_LOOKUP_DES_PART2_PARITY_ERR,
+ C_RX_LOOKUP_DES_PART1_UNC_COR_ERR,
+ C_RX_LOOKUP_DES_PART1_UNC_ERR,
+ C_RX_RBUF_NEXT_FREE_BUF_COR_ERR,
+ C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR,
+ C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR,
+ C_RX_RBUF_FL_INITDONE_PARITY_ERR,
+ C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR,
+ C_RX_RBUF_FL_RD_ADDR_PARITY_ERR,
+ C_RX_RBUF_EMPTY_ERR,
+ C_RX_RBUF_FULL_ERR,
+ C_RX_RBUF_BAD_LOOKUP_ERR,
+ C_RX_RBUF_CTX_ID_PARITY_ERR,
+ C_RX_RBUF_CSR_QEOPDW_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR,
+ C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR,
+ C_RX_RBUF_BLOCK_LIST_READ_COR_ERR,
+ C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR,
+ C_RX_RBUF_LOOKUP_DES_COR_ERR,
+ C_RX_RBUF_LOOKUP_DES_UNC_ERR,
+ C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR,
+ C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR,
+ C_RX_RBUF_FREE_LIST_COR_ERR,
+ C_RX_RBUF_FREE_LIST_UNC_ERR,
+ C_RX_RCV_FSM_ENCODING_ERR,
+ C_RX_DMA_FLAG_COR_ERR,
+ C_RX_DMA_FLAG_UNC_ERR,
+ C_RX_DC_SOP_EOP_PARITY_ERR,
+ C_RX_RCV_CSR_PARITY_ERR,
+ C_RX_RCV_QP_MAP_TABLE_COR_ERR,
+ C_RX_RCV_QP_MAP_TABLE_UNC_ERR,
+ C_RX_RCV_DATA_COR_ERR,
+ C_RX_RCV_DATA_UNC_ERR,
+ C_RX_RCV_HDR_COR_ERR,
+ C_RX_RCV_HDR_UNC_ERR,
+ C_RX_DC_INTF_PARITY_ERR,
+ C_RX_DMA_CSR_COR_ERR,
+/* SendPioErrStatus */
+ C_PIO_PEC_SOP_HEAD_PARITY_ERR,
+ C_PIO_PCC_SOP_HEAD_PARITY_ERR,
+ C_PIO_LAST_RETURNED_CNT_PARITY_ERR,
+ C_PIO_CURRENT_FREE_CNT_PARITY_ERR,
+ C_PIO_RSVD_31_ERR,
+ C_PIO_RSVD_30_ERR,
+ C_PIO_PPMC_SOP_LEN_ERR,
+ C_PIO_PPMC_BQC_MEM_PARITY_ERR,
+ C_PIO_VL_FIFO_PARITY_ERR,
+ C_PIO_VLF_SOP_PARITY_ERR,
+ C_PIO_VLF_V1_LEN_PARITY_ERR,
+ C_PIO_BLOCK_QW_COUNT_PARITY_ERR,
+ C_PIO_WRITE_QW_VALID_PARITY_ERR,
+ C_PIO_STATE_MACHINE_ERR,
+ C_PIO_WRITE_DATA_PARITY_ERR,
+ C_PIO_HOST_ADDR_MEM_COR_ERR,
+ C_PIO_HOST_ADDR_MEM_UNC_ERR,
+ C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR,
+ C_PIO_INIT_SM_IN_ERR,
+ C_PIO_PPMC_PBL_FIFO_ERR,
+ C_PIO_CREDIT_RET_FIFO_PARITY_ERR,
+ C_PIO_V1_LEN_MEM_BANK1_COR_ERR,
+ C_PIO_V1_LEN_MEM_BANK0_COR_ERR,
+ C_PIO_V1_LEN_MEM_BANK1_UNC_ERR,
+ C_PIO_V1_LEN_MEM_BANK0_UNC_ERR,
+ C_PIO_SM_PKT_RESET_PARITY_ERR,
+ C_PIO_PKT_EVICT_FIFO_PARITY_ERR,
+ C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR,
+ C_PIO_SBRDCTL_CRREL_PARITY_ERR,
+ C_PIO_PEC_FIFO_PARITY_ERR,
+ C_PIO_PCC_FIFO_PARITY_ERR,
+ C_PIO_SB_MEM_FIFO1_ERR,
+ C_PIO_SB_MEM_FIFO0_ERR,
+ C_PIO_CSR_PARITY_ERR,
+ C_PIO_WRITE_ADDR_PARITY_ERR,
+ C_PIO_WRITE_BAD_CTXT_ERR,
+/* SendDmaErrStatus */
+ C_SDMA_PCIE_REQ_TRACKING_COR_ERR,
+ C_SDMA_PCIE_REQ_TRACKING_UNC_ERR,
+ C_SDMA_CSR_PARITY_ERR,
+ C_SDMA_RPY_TAG_ERR,
+/* SendEgressErrStatus */
+ C_TX_READ_PIO_MEMORY_CSR_UNC_ERR,
+ C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR,
+ C_TX_EGRESS_FIFO_COR_ERR,
+ C_TX_READ_PIO_MEMORY_COR_ERR,
+ C_TX_READ_SDMA_MEMORY_COR_ERR,
+ C_TX_SB_HDR_COR_ERR,
+ C_TX_CREDIT_OVERRUN_ERR,
+ C_TX_LAUNCH_FIFO8_COR_ERR,
+ C_TX_LAUNCH_FIFO7_COR_ERR,
+ C_TX_LAUNCH_FIFO6_COR_ERR,
+ C_TX_LAUNCH_FIFO5_COR_ERR,
+ C_TX_LAUNCH_FIFO4_COR_ERR,
+ C_TX_LAUNCH_FIFO3_COR_ERR,
+ C_TX_LAUNCH_FIFO2_COR_ERR,
+ C_TX_LAUNCH_FIFO1_COR_ERR,
+ C_TX_LAUNCH_FIFO0_COR_ERR,
+ C_TX_CREDIT_RETURN_VL_ERR,
+ C_TX_HCRC_INSERTION_ERR,
+ C_TX_EGRESS_FIFI_UNC_ERR,
+ C_TX_READ_PIO_MEMORY_UNC_ERR,
+ C_TX_READ_SDMA_MEMORY_UNC_ERR,
+ C_TX_SB_HDR_UNC_ERR,
+ C_TX_CREDIT_RETURN_PARITY_ERR,
+ C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR,
+ C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR,
+ C_TX_SDMA15_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA14_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA13_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA12_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA11_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA10_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA9_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA8_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA7_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA6_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA5_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA4_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA3_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA2_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA1_DISALLOWED_PACKET_ERR,
+ C_TX_SDMA0_DISALLOWED_PACKET_ERR,
+ C_TX_CONFIG_PARITY_ERR,
+ C_TX_SBRD_CTL_CSR_PARITY_ERR,
+ C_TX_LAUNCH_CSR_PARITY_ERR,
+ C_TX_ILLEGAL_CL_ERR,
+ C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR,
+ C_TX_RESERVED_10,
+ C_TX_RESERVED_9,
+ C_TX_SDMA_LAUNCH_INTF_PARITY_ERR,
+ C_TX_PIO_LAUNCH_INTF_PARITY_ERR,
+ C_TX_RESERVED_6,
+ C_TX_INCORRECT_LINK_STATE_ERR,
+ C_TX_LINK_DOWN_ERR,
+ C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR,
+ C_TX_RESERVED_2,
+ C_TX_PKT_INTEGRITY_MEM_UNC_ERR,
+ C_TX_PKT_INTEGRITY_MEM_COR_ERR,
+/* SendErrStatus */
+ C_SEND_CSR_WRITE_BAD_ADDR_ERR,
+ C_SEND_CSR_READ_BAD_ADD_ERR,
+ C_SEND_CSR_PARITY_ERR,
+/* SendCtxtErrStatus */
+ C_PIO_WRITE_OUT_OF_BOUNDS_ERR,
+ C_PIO_WRITE_OVERFLOW_ERR,
+ C_PIO_WRITE_CROSSES_BOUNDARY_ERR,
+ C_PIO_DISALLOWED_PACKET_ERR,
+ C_PIO_INCONSISTENT_SOP_ERR,
+/*SendDmaEngErrStatus */
+ C_SDMA_HEADER_REQUEST_FIFO_COR_ERR,
+ C_SDMA_HEADER_STORAGE_COR_ERR,
+ C_SDMA_PACKET_TRACKING_COR_ERR,
+ C_SDMA_ASSEMBLY_COR_ERR,
+ C_SDMA_DESC_TABLE_COR_ERR,
+ C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR,
+ C_SDMA_HEADER_STORAGE_UNC_ERR,
+ C_SDMA_PACKET_TRACKING_UNC_ERR,
+ C_SDMA_ASSEMBLY_UNC_ERR,
+ C_SDMA_DESC_TABLE_UNC_ERR,
+ C_SDMA_TIMEOUT_ERR,
+ C_SDMA_HEADER_LENGTH_ERR,
+ C_SDMA_HEADER_ADDRESS_ERR,
+ C_SDMA_HEADER_SELECT_ERR,
+ C_SMDA_RESERVED_9,
+ C_SDMA_PACKET_DESC_OVERFLOW_ERR,
+ C_SDMA_LENGTH_MISMATCH_ERR,
+ C_SDMA_HALT_ERR,
+ C_SDMA_MEM_READ_ERR,
+ C_SDMA_FIRST_DESC_ERR,
+ C_SDMA_TAIL_OUT_OF_BOUNDS_ERR,
+ C_SDMA_TOO_LONG_ERR,
+ C_SDMA_GEN_MISMATCH_ERR,
+ C_SDMA_WRONG_DW_ERR,
+ DEV_CNTR_LAST /* Must be kept last */
+};
+
+/* Per port counter indexes */
+enum {
+ C_TX_UNSUP_VL = 0,
+ C_TX_INVAL_LEN,
+ C_TX_MM_LEN_ERR,
+ C_TX_UNDERRUN,
+ C_TX_FLOW_STALL,
+ C_TX_DROPPED,
+ C_TX_HDR_ERR,
+ C_TX_PKT,
+ C_TX_WORDS,
+ C_TX_WAIT,
+ C_TX_FLIT_VL,
+ C_TX_PKT_VL,
+ C_TX_WAIT_VL,
+ C_RX_PKT,
+ C_RX_WORDS,
+ C_SW_LINK_DOWN,
+ C_SW_LINK_UP,
+ C_SW_UNKNOWN_FRAME,
+ C_SW_XMIT_DSCD,
+ C_SW_XMIT_DSCD_VL,
+ C_SW_XMIT_CSTR_ERR,
+ C_SW_RCV_CSTR_ERR,
+ C_SW_IBP_LOOP_PKTS,
+ C_SW_IBP_RC_RESENDS,
+ C_SW_IBP_RNR_NAKS,
+ C_SW_IBP_OTHER_NAKS,
+ C_SW_IBP_RC_TIMEOUTS,
+ C_SW_IBP_PKT_DROPS,
+ C_SW_IBP_DMA_WAIT,
+ C_SW_IBP_RC_SEQNAK,
+ C_SW_IBP_RC_DUPREQ,
+ C_SW_IBP_RDMA_SEQ,
+ C_SW_IBP_UNALIGNED,
+ C_SW_IBP_SEQ_NAK,
+ C_SW_CPU_RC_ACKS,
+ C_SW_CPU_RC_QACKS,
+ C_SW_CPU_RC_DELAYED_COMP,
+ C_RCV_HDR_OVF_0,
+ C_RCV_HDR_OVF_1,
+ C_RCV_HDR_OVF_2,
+ C_RCV_HDR_OVF_3,
+ C_RCV_HDR_OVF_4,
+ C_RCV_HDR_OVF_5,
+ C_RCV_HDR_OVF_6,
+ C_RCV_HDR_OVF_7,
+ C_RCV_HDR_OVF_8,
+ C_RCV_HDR_OVF_9,
+ C_RCV_HDR_OVF_10,
+ C_RCV_HDR_OVF_11,
+ C_RCV_HDR_OVF_12,
+ C_RCV_HDR_OVF_13,
+ C_RCV_HDR_OVF_14,
+ C_RCV_HDR_OVF_15,
+ C_RCV_HDR_OVF_16,
+ C_RCV_HDR_OVF_17,
+ C_RCV_HDR_OVF_18,
+ C_RCV_HDR_OVF_19,
+ C_RCV_HDR_OVF_20,
+ C_RCV_HDR_OVF_21,
+ C_RCV_HDR_OVF_22,
+ C_RCV_HDR_OVF_23,
+ C_RCV_HDR_OVF_24,
+ C_RCV_HDR_OVF_25,
+ C_RCV_HDR_OVF_26,
+ C_RCV_HDR_OVF_27,
+ C_RCV_HDR_OVF_28,
+ C_RCV_HDR_OVF_29,
+ C_RCV_HDR_OVF_30,
+ C_RCV_HDR_OVF_31,
+ C_RCV_HDR_OVF_32,
+ C_RCV_HDR_OVF_33,
+ C_RCV_HDR_OVF_34,
+ C_RCV_HDR_OVF_35,
+ C_RCV_HDR_OVF_36,
+ C_RCV_HDR_OVF_37,
+ C_RCV_HDR_OVF_38,
+ C_RCV_HDR_OVF_39,
+ C_RCV_HDR_OVF_40,
+ C_RCV_HDR_OVF_41,
+ C_RCV_HDR_OVF_42,
+ C_RCV_HDR_OVF_43,
+ C_RCV_HDR_OVF_44,
+ C_RCV_HDR_OVF_45,
+ C_RCV_HDR_OVF_46,
+ C_RCV_HDR_OVF_47,
+ C_RCV_HDR_OVF_48,
+ C_RCV_HDR_OVF_49,
+ C_RCV_HDR_OVF_50,
+ C_RCV_HDR_OVF_51,
+ C_RCV_HDR_OVF_52,
+ C_RCV_HDR_OVF_53,
+ C_RCV_HDR_OVF_54,
+ C_RCV_HDR_OVF_55,
+ C_RCV_HDR_OVF_56,
+ C_RCV_HDR_OVF_57,
+ C_RCV_HDR_OVF_58,
+ C_RCV_HDR_OVF_59,
+ C_RCV_HDR_OVF_60,
+ C_RCV_HDR_OVF_61,
+ C_RCV_HDR_OVF_62,
+ C_RCV_HDR_OVF_63,
+ C_RCV_HDR_OVF_64,
+ C_RCV_HDR_OVF_65,
+ C_RCV_HDR_OVF_66,
+ C_RCV_HDR_OVF_67,
+ C_RCV_HDR_OVF_68,
+ C_RCV_HDR_OVF_69,
+ C_RCV_HDR_OVF_70,
+ C_RCV_HDR_OVF_71,
+ C_RCV_HDR_OVF_72,
+ C_RCV_HDR_OVF_73,
+ C_RCV_HDR_OVF_74,
+ C_RCV_HDR_OVF_75,
+ C_RCV_HDR_OVF_76,
+ C_RCV_HDR_OVF_77,
+ C_RCV_HDR_OVF_78,
+ C_RCV_HDR_OVF_79,
+ C_RCV_HDR_OVF_80,
+ C_RCV_HDR_OVF_81,
+ C_RCV_HDR_OVF_82,
+ C_RCV_HDR_OVF_83,
+ C_RCV_HDR_OVF_84,
+ C_RCV_HDR_OVF_85,
+ C_RCV_HDR_OVF_86,
+ C_RCV_HDR_OVF_87,
+ C_RCV_HDR_OVF_88,
+ C_RCV_HDR_OVF_89,
+ C_RCV_HDR_OVF_90,
+ C_RCV_HDR_OVF_91,
+ C_RCV_HDR_OVF_92,
+ C_RCV_HDR_OVF_93,
+ C_RCV_HDR_OVF_94,
+ C_RCV_HDR_OVF_95,
+ C_RCV_HDR_OVF_96,
+ C_RCV_HDR_OVF_97,
+ C_RCV_HDR_OVF_98,
+ C_RCV_HDR_OVF_99,
+ C_RCV_HDR_OVF_100,
+ C_RCV_HDR_OVF_101,
+ C_RCV_HDR_OVF_102,
+ C_RCV_HDR_OVF_103,
+ C_RCV_HDR_OVF_104,
+ C_RCV_HDR_OVF_105,
+ C_RCV_HDR_OVF_106,
+ C_RCV_HDR_OVF_107,
+ C_RCV_HDR_OVF_108,
+ C_RCV_HDR_OVF_109,
+ C_RCV_HDR_OVF_110,
+ C_RCV_HDR_OVF_111,
+ C_RCV_HDR_OVF_112,
+ C_RCV_HDR_OVF_113,
+ C_RCV_HDR_OVF_114,
+ C_RCV_HDR_OVF_115,
+ C_RCV_HDR_OVF_116,
+ C_RCV_HDR_OVF_117,
+ C_RCV_HDR_OVF_118,
+ C_RCV_HDR_OVF_119,
+ C_RCV_HDR_OVF_120,
+ C_RCV_HDR_OVF_121,
+ C_RCV_HDR_OVF_122,
+ C_RCV_HDR_OVF_123,
+ C_RCV_HDR_OVF_124,
+ C_RCV_HDR_OVF_125,
+ C_RCV_HDR_OVF_126,
+ C_RCV_HDR_OVF_127,
+ C_RCV_HDR_OVF_128,
+ C_RCV_HDR_OVF_129,
+ C_RCV_HDR_OVF_130,
+ C_RCV_HDR_OVF_131,
+ C_RCV_HDR_OVF_132,
+ C_RCV_HDR_OVF_133,
+ C_RCV_HDR_OVF_134,
+ C_RCV_HDR_OVF_135,
+ C_RCV_HDR_OVF_136,
+ C_RCV_HDR_OVF_137,
+ C_RCV_HDR_OVF_138,
+ C_RCV_HDR_OVF_139,
+ C_RCV_HDR_OVF_140,
+ C_RCV_HDR_OVF_141,
+ C_RCV_HDR_OVF_142,
+ C_RCV_HDR_OVF_143,
+ C_RCV_HDR_OVF_144,
+ C_RCV_HDR_OVF_145,
+ C_RCV_HDR_OVF_146,
+ C_RCV_HDR_OVF_147,
+ C_RCV_HDR_OVF_148,
+ C_RCV_HDR_OVF_149,
+ C_RCV_HDR_OVF_150,
+ C_RCV_HDR_OVF_151,
+ C_RCV_HDR_OVF_152,
+ C_RCV_HDR_OVF_153,
+ C_RCV_HDR_OVF_154,
+ C_RCV_HDR_OVF_155,
+ C_RCV_HDR_OVF_156,
+ C_RCV_HDR_OVF_157,
+ C_RCV_HDR_OVF_158,
+ C_RCV_HDR_OVF_159,
+ PORT_CNTR_LAST /* Must be kept last */
+};
+
+u64 get_all_cpu_total(u64 __percpu *cntr);
+void hfi1_start_cleanup(struct hfi1_devdata *dd);
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
+struct hfi1_message_header *hfi1_get_msgheader(
+ struct hfi1_devdata *dd, __le32 *rhf_addr);
+int hfi1_init_ctxt(struct send_context *sc);
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+ u32 type, unsigned long pa, u16 order);
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp);
+u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp);
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+
+/*
+ * Interrupt source table.
+ *
+ * Each entry is an interrupt source "type". It is ordered by increasing
+ * number.
+ */
+struct is_table {
+ int start; /* interrupt source type start */
+ int end; /* interrupt source type end */
+ /* routine that returns the name of the interrupt source */
+ char *(*is_name)(char *name, size_t size, unsigned int source);
+ /* routine to call when receiving an interrupt */
+ void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
+};
+
+#endif /* _CHIP_H */
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
new file mode 100644
index 000000000000..5b9993899789
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/chip_registers.h
@@ -0,0 +1,1311 @@
+#ifndef DEF_CHIP_REG
+#define DEF_CHIP_REG
+
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define CORE 0x000000000000
+#define CCE (CORE + 0x000000000000)
+#define ASIC (CORE + 0x000000400000)
+#define MISC (CORE + 0x000000500000)
+#define DC_TOP_CSRS (CORE + 0x000000600000)
+#define CHIP_DEBUG (CORE + 0x000000700000)
+#define RXE (CORE + 0x000001000000)
+#define TXE (CORE + 0x000001800000)
+#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000)
+#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000)
+#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000)
+#define PCIE 0
+
+#define ASIC_NUM_SCRATCH 4
+#define CCE_ERR_INT_CNT 0
+#define CCE_MISC_INT_CNT 2
+#define CCE_NUM_32_BIT_COUNTERS 3
+#define CCE_NUM_32_BIT_INT_COUNTERS 6
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define CCE_NUM_MSIX_PBAS 4
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_SCRATCH 4
+#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
+#define CCE_PCIE_TRGT_STALL_CNT 0
+#define CCE_PIO_WR_STALL_CNT 1
+#define CCE_RCV_AVAIL_INT_CNT 3
+#define CCE_RCV_URGENT_INT_CNT 4
+#define CCE_SDMA_INT_CNT 1
+#define CCE_SEND_CREDIT_INT_CNT 5
+#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
+#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
+#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
+#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
+#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
+#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
+#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
+#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
+#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
+#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
+#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
+#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
+#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
+#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
+#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
+#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
+#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
+#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
+#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
+#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
+#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
+#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
+#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
+#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
+#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
+#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
+#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
+#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
+#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
+#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
+#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
+#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
+#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
+#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
+#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
+#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
+#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
+#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
+#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
+#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
+#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
+#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
+#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
+#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
+#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
+#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
+#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
+#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
+#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
+#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
+#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
+#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
+#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
+#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
+#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
+#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
+#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
+#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
+#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
+#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
+#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
+#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
+#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
+#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
+#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
+#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
+#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
+#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
+#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
+#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
+#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
+#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
+#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
+#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
+#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
+#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
+#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
+#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
+#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
+#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
+#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
+#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
+#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
+#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
+#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
+#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
+#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
+#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
+#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
+#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
+#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
+#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
+#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
+#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
+#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
+#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
+#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
+#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
+#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
+#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
+#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
+#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
+#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
+#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
+#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
+#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
+#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
+#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
+#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
+#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
+#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
+#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
+#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
+#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
+#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
+#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
+#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
+#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
+#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
+#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
+#define DC_LCB_CFG_RUN_EN_SHIFT 0
+#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
+#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
+#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
+#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
+#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
+#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
+#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030)
+#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040)
+#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110)
+#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
+#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
+#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
+#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
+#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
+#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
+#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
+#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
+#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
+#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
+#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
+#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
+#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
+#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
+#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
+#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
+#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
+#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
+#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
+#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
+#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
+#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
+#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
+#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
+#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
+#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
+#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
+#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
+#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
+#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
+#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
+#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
+#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
+#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
+#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
+#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
+#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
+#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
+#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
+#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_BUF_OVFL_CNT 10
+#define RCV_CONTEXT_EGR_STALL 22
+#define RCV_DATA_PKT_CNT 0
+#define RCV_DWORD_CNT 1
+#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
+#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
+#define RCV_TID_FULL_ERR_CNT 18
+#define RCV_TID_VALID_ERR_CNT 19
+#define RXE_NUM_32_BIT_COUNTERS 24
+#define RXE_NUM_64_BIT_COUNTERS 2
+#define RXE_NUM_RSM_INSTANCES 4
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_PER_CONTEXT_OFFSET 0x0300000
+#define SEND_DATA_PKT_CNT 0
+#define SEND_DATA_PKT_VL0_CNT 12
+#define SEND_DATA_VL0_CNT 3
+#define SEND_DROPPED_PKT_CNT 5
+#define SEND_DWORD_CNT 1
+#define SEND_FLOW_STALL_CNT 4
+#define SEND_HEADERS_ERR_CNT 6
+#define SEND_LEN_ERR_CNT 1
+#define SEND_MAX_MIN_LEN_ERR_CNT 2
+#define SEND_UNDERRUN_CNT 3
+#define SEND_UNSUP_VL_ERR_CNT 0
+#define SEND_WAIT_CNT 2
+#define SEND_WAIT_VL0_CNT 21
+#define TXE_PIO_SEND_OFFSET 0x0800000
+#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
+#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
+#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
+#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
+#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
+#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
+#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
+#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
+#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
+#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
+#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
+#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
+#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
+#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
+#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
+#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
+#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
+#define ASIC_EEP_DATA (ASIC + 0x000000000310)
+#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
+#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
+#define ASIC_GPIO_IN (ASIC + 0x000000000200)
+#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
+#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
+#define ASIC_GPIO_OE (ASIC + 0x000000000208)
+#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
+#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
+#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
+#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
+#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
+#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
+#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
+#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
+#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
+#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
+#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
+#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
+#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
+#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
+#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
+#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
+#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
+#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
+#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
+#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
+#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
+#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
+#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
+#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
+#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
+#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
+#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
+#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2
+#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull
+#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32
+#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull
+#define ASIC_STS_THERM (ASIC + 0x000000000058)
+#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
+#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
+#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
+#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
+#define ASIC_STS_THERM_LOW_SHIFT 13
+#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
+#define CCE_CTRL (CCE + 0x000000000010)
+#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
+#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
+#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
+#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
+#define CCE_DC_CTRL (CCE + 0x0000000000B8)
+#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
+#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
+#define CCE_ERR_CLEAR (CCE + 0x000000000050)
+#define CCE_ERR_MASK (CCE + 0x000000000048)
+#define CCE_ERR_STATUS (CCE + 0x000000000040)
+#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
+ 0x200ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
+ 0x800ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
+ 0x400ull
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
+#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
+#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
+#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
+#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
+#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
+#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
+#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
+#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
+#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
+#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
+#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
+#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
+#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
+#define CCE_INT_CLEAR (CCE + 0x000000110A00)
+#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
+#define CCE_INT_FORCE (CCE + 0x000000110B00)
+#define CCE_INT_MAP (CCE + 0x000000110500)
+#define CCE_INT_MASK (CCE + 0x000000110900)
+#define CCE_INT_STATUS (CCE + 0x000000110800)
+#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
+#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
+#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
+#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
+#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
+#define CCE_PCIE_CTRL (CCE + 0x0000000000C0)
+#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull
+#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0
+#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull
+#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2
+#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8
+#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull
+#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13
+#define CCE_REVISION (CCE + 0x000000000000)
+#define CCE_REVISION2 (CCE + 0x000000000008)
+#define CCE_REVISION2_HFI_ID_MASK 0x1ull
+#define CCE_REVISION2_HFI_ID_SHIFT 0
+#define CCE_REVISION2_IMPL_CODE_SHIFT 8
+#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
+#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
+#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
+#define CCE_REVISION_SW_MASK 0xFFull
+#define CCE_REVISION_SW_SHIFT 24
+#define CCE_SCRATCH (CCE + 0x000000000020)
+#define CCE_STATUS (CCE + 0x000000000018)
+#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
+#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
+#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
+#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
+#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
+#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
+#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
+#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
+#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
+#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
+#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
+#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
+#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
+#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
+#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
+#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
+#define MISC_ERR_CLEAR (MISC + 0x000000002010)
+#define MISC_ERR_MASK (MISC + 0x000000002008)
+#define MISC_ERR_STATUS (MISC + 0x000000002000)
+#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
+#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
+#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
+#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
+#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
+#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
+#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
+#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
+#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
+#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
+#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
+#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
+#define PCI_CFG_REG1 (PCIE + 0x000000000004)
+#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
+#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
+#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
+#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
+#define RCV_ARRAY (RXE + 0x000000200000)
+#define RCV_ARRAY_CNT (RXE + 0x000000000018)
+#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
+#define RCV_ARRAY_RT_ADDR_SHIFT 0
+#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
+#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
+#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
+#define RCV_BTH_QP (RXE + 0x000000000028)
+#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
+#define RCV_BTH_QP_KDETH_QP_SHIFT 16
+#define RCV_BYPASS (RXE + 0x000000000038)
+#define RCV_CONTEXTS (RXE + 0x000000000010)
+#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
+#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
+#define RCV_CTRL (RXE + 0x000000000000)
+#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
+#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
+#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
+#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
+#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
+#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
+#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
+#define RCV_CTXT_CTRL (RXE + 0x000000100000)
+#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
+#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
+#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
+#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
+#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
+#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
+#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
+#define RCV_CTXT_STATUS (RXE + 0x000000100008)
+#define RCV_EGR_CTRL (RXE + 0x000000100010)
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
+#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
+#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
+#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
+#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
+#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
+#define RCV_ERR_CLEAR (RXE + 0x000000000070)
+#define RCV_ERR_INFO (RXE + 0x000000000050)
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
+#define RCV_ERR_MASK (RXE + 0x000000000068)
+#define RCV_ERR_STATUS (RXE + 0x000000000060)
+#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
+ 0x4000000000000000ull
+#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
+#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
+ 0x40000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+ 0x20000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+ 0x800000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+ 0x400000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+ 0x10000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+ 0x20000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+ 0x200000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+ 0x8000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+ 0x1000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
+#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
+#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
+#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
+#define RCV_HDR_ADDR (RXE + 0x000000100028)
+#define RCV_HDR_CNT (RXE + 0x000000100030)
+#define RCV_HDR_CNT_CNT_MASK 0x1FFull
+#define RCV_HDR_CNT_CNT_SHIFT 0
+#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
+#define RCV_HDR_HEAD (RXE + 0x000000300008)
+#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
+#define RCV_HDR_HEAD_COUNTER_SHIFT 32
+#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
+#define RCV_HDR_HEAD_HEAD_SHIFT 0
+#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
+#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
+#define RCV_HDR_SIZE (RXE + 0x000000100040)
+#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
+#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
+#define RCV_HDR_TAIL (RXE + 0x000000300000)
+#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
+#define RCV_KEY_CTRL (RXE + 0x000000100020)
+#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
+#define RCV_MULTICAST (RXE + 0x000000000030)
+#define RCV_PARTITION_KEY (RXE + 0x000000000200)
+#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
+#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
+#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
+#define RCV_RSM_CFG (RXE + 0x000000000600)
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
+#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_CFG_OFFSET_SHIFT 32
+#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
+#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
+#define RCV_RSM_MATCH (RXE + 0x000000000800)
+#define RCV_RSM_MATCH_MASK1_SHIFT 0
+#define RCV_RSM_MATCH_MASK2_SHIFT 16
+#define RCV_RSM_MATCH_VALUE1_SHIFT 8
+#define RCV_RSM_MATCH_VALUE2_SHIFT 24
+#define RCV_RSM_SELECT (RXE + 0x000000000700)
+#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
+#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
+#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
+#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
+#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
+#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
+#define RCV_STATUS (RXE + 0x000000000008)
+#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
+#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
+#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
+#define RCV_TID_CTRL (RXE + 0x000000100018)
+#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
+#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
+#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
+#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
+#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
+#define RCV_VL15 (RXE + 0x000000000048)
+#define SEND_BTH_QP (TXE + 0x0000000000A0)
+#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
+#define SEND_BTH_QP_KDETH_QP_SHIFT 16
+#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
+#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
+ 0x1000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
+ 0x8000000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
+ 0x2000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
+ 0x4000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
+ 0x8000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
+ 0x10000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
+ 0x20000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
+ 0x40000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
+ 0x80000000000000ull
+#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
+#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
+#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
+#define SEND_CM_CTRL (TXE + 0x000000000500)
+#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
+#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
+#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
+#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
+#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
+#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
+#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
+#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
+#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
+#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
+#define SEND_CONTEXTS (TXE + 0x000000000010)
+#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
+#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
+#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
+#define SEND_CTRL (TXE + 0x000000000000)
+#define SEND_CTRL_CM_RESET_SMASK 0x4ull
+#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
+#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
+ 0x200000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+ 0x100000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+ 0x80000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
+ 0x40000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+ 0x8000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
+ 0x4000ull
+#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
+#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
+#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
+#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
+#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
+#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
+#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
+#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
+#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
+#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
+#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
+#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
+#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
+#define SEND_CTXT_CTRL (TXE + 0x000000100000)
+#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
+#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
+#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
+#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
+#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
+#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
+#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
+#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
+#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
+#define SEND_CTXT_STATUS (TXE + 0x000000100008)
+#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
+#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
+#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
+#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+ 0x100000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+ 0x80000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+ 0x8000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
+#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
+#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
+#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
+#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
+#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
+#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
+#define SEND_DMA_CTRL (TXE + 0x000000200000)
+#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
+#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
+#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
+#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
+#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
+#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
+#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
+#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
+#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
+ 0x40000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
+ 0x20000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
+ 0x100ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
+ 0x10000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
+#define SEND_DMA_ENGINES (TXE + 0x000000000018)
+#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
+#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
+#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
+#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
+#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
+#define SEND_DMA_HEAD (TXE + 0x000000200028)
+#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
+#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
+#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
+#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
+#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
+#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
+#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
+#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
+#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
+#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
+#define SEND_DMA_STATUS (TXE + 0x000000200008)
+#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
+#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
+#define SEND_DMA_TAIL (TXE + 0x000000200020)
+#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
+ 0x3FFFull
+#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
+#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
+#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
+#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
+#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
+#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
+#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
+#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
+#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
+#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
+#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
+#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
+#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
+#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
+#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
+#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
+ 0x200000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
+ 0x20000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
+ 0x800000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
+ 0x2000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
+ 0x200000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
+ 0x8ull
+#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
+ 0x400000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
+ 0x1000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
+ 0x100000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
+ 0x2000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
+ 0x200000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
+ 0x4000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
+ 0x400000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
+ 0x8000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
+ 0x800000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
+ 0x10000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
+ 0x1000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
+ 0x20000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
+ 0x2000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
+ 0x40000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
+ 0x4000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
+ 0x80000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
+ 0x8000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
+ 0x100000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
+ 0x10000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
+ 0x1000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
+ 0x8000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
+ 0x100000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
+ 0x800000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
+ 0x4000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
+ 0x80000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
+ 0x800ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
+ 0x10000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
+ 0x4000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
+ 0x8000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
+ 0x10000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
+ 0x20000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
+ 0x40000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
+ 0x80000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
+ 0x20000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
+ 0x40000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
+ 0x80000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
+ 0x100000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
+ 0x200000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
+ 0x400000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
+ 0x800000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
+ 0x1000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
+ 0x2000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
+ 0x100ull
+#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+ 0x3FFFull
+#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
+#define SEND_ERR_MASK (TXE + 0x0000000000E8)
+#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
+#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
+#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
+#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
+#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
+#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
+#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
+#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
+#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
+#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
+#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
+#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
+#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
+#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
+#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
+#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
+#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+ 0x1000000ull
+#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
+#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
+#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+ 0x100000000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
+#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+ 0x200000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+ 0x400000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
+ 0x800000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+ 0x100ull
+#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
+#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
+#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
+#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
+#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
+#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
+#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
+#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
+#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
+#define SEND_SC2VLT0_SC0_SHIFT 0
+#define SEND_SC2VLT0_SC1_SHIFT 8
+#define SEND_SC2VLT0_SC2_SHIFT 16
+#define SEND_SC2VLT0_SC3_SHIFT 24
+#define SEND_SC2VLT0_SC4_SHIFT 32
+#define SEND_SC2VLT0_SC5_SHIFT 40
+#define SEND_SC2VLT0_SC6_SHIFT 48
+#define SEND_SC2VLT0_SC7_SHIFT 56
+#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
+#define SEND_SC2VLT1_SC10_SHIFT 16
+#define SEND_SC2VLT1_SC11_SHIFT 24
+#define SEND_SC2VLT1_SC12_SHIFT 32
+#define SEND_SC2VLT1_SC13_SHIFT 40
+#define SEND_SC2VLT1_SC14_SHIFT 48
+#define SEND_SC2VLT1_SC15_SHIFT 56
+#define SEND_SC2VLT1_SC8_SHIFT 0
+#define SEND_SC2VLT1_SC9_SHIFT 8
+#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
+#define SEND_SC2VLT2_SC16_SHIFT 0
+#define SEND_SC2VLT2_SC17_SHIFT 8
+#define SEND_SC2VLT2_SC18_SHIFT 16
+#define SEND_SC2VLT2_SC19_SHIFT 24
+#define SEND_SC2VLT2_SC20_SHIFT 32
+#define SEND_SC2VLT2_SC21_SHIFT 40
+#define SEND_SC2VLT2_SC22_SHIFT 48
+#define SEND_SC2VLT2_SC23_SHIFT 56
+#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
+#define SEND_SC2VLT3_SC24_SHIFT 0
+#define SEND_SC2VLT3_SC25_SHIFT 8
+#define SEND_SC2VLT3_SC26_SHIFT 16
+#define SEND_SC2VLT3_SC27_SHIFT 24
+#define SEND_SC2VLT3_SC28_SHIFT 32
+#define SEND_SC2VLT3_SC29_SHIFT 40
+#define SEND_SC2VLT3_SC30_SHIFT 48
+#define SEND_SC2VLT3_SC31_SHIFT 56
+#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
+#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
+#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C)
+#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27
+#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000
+#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
+#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
+#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
+#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
+#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
+#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
+#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
+#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
+#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
+#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
+#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
+#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
+#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
+#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
+#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
+#define CCE_MSIX_PBA_OFFSET 0X0110000
+
+#endif /* DEF_CHIP_REG */
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
new file mode 100644
index 000000000000..fcc9c217a97a
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -0,0 +1,411 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <rdma/hfi/hfi1_user.h>
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fast path code
+ * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fast path code
+ * _HFI1_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ */
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define DEFAULT_KDETH_QP 0x80
+
+/* driver/hw feature set bitmask */
+#define HFI1_CAP_USER_SHIFT 24
+#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1)
+/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
+#define HFI1_CAP_LOCKED_SHIFT 63
+#define HFI1_CAP_LOCKED_MASK 0x1ULL
+#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
+/* extra bits used between kernel and user processes */
+#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2)
+#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
+ HFI1_CAP_MISC_SHIFT)) - 1)
+
+#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
+#define HFI1_CAP_KCLEAR(cap) \
+ ({ \
+ hfi1_cap_mask &= ~HFI1_CAP_##cap; \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_USET(cap) \
+ ({ \
+ hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_UCLEAR(cap) \
+ ({ \
+ hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_SET(cap) \
+ ({ \
+ hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \
+ HFI1_CAP_USER_SHIFT)); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_CLEAR(cap) \
+ ({ \
+ hfi1_cap_mask &= ~(HFI1_CAP_##cap | \
+ (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
+ hfi1_cap_mask; \
+ })
+#define HFI1_CAP_LOCK() \
+ ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
+#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
+/*
+ * The set of capability bits that can be changed after initial load
+ * This set is the same for kernel and user contexts. However, for
+ * user contexts, the set can be further filtered by using the
+ * HFI1_CAP_RESERVED_MASK bits.
+ */
+#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \
+ HFI1_CAP_HDRSUPP | \
+ HFI1_CAP_MULTI_PKT_EGR | \
+ HFI1_CAP_NODROP_RHQ_FULL | \
+ HFI1_CAP_NODROP_EGR_FULL | \
+ HFI1_CAP_ALLOW_PERM_JKEY | \
+ HFI1_CAP_STATIC_RATE_CTRL | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_TID_UNMAP)
+/*
+ * A set of capability bits that are "global" and are not allowed to be
+ * set in the user bitmask.
+ */
+#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \
+ HFI1_CAP_USE_SDMA_HEAD | \
+ HFI1_CAP_EXTENDED_PSN | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_NO_INTEGRITY | \
+ HFI1_CAP_PKEY_CHECK) << \
+ HFI1_CAP_USER_SHIFT)
+/*
+ * Set of capabilities that need to be enabled for kernel context in
+ * order to be allowed for user contexts, as well.
+ */
+#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
+/* Default enabled capabilities (both kernel and user) */
+#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \
+ HFI1_CAP_NODROP_RHQ_FULL | \
+ HFI1_CAP_NODROP_EGR_FULL | \
+ HFI1_CAP_SDMA | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_STATIC_RATE_CTRL | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_MULTI_PKT_EGR | \
+ HFI1_CAP_EXTENDED_PSN | \
+ ((HFI1_CAP_HDRSUPP | \
+ HFI1_CAP_MULTI_PKT_EGR | \
+ HFI1_CAP_STATIC_RATE_CTRL | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_EARLY_CREDIT_RETURN) << \
+ HFI1_CAP_USER_SHIFT))
+/*
+ * A bitmask of kernel/global capabilities that should be communicated
+ * to user level processes.
+ */
+#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \
+ HFI1_CAP_EXTENDED_PSN | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_NO_INTEGRITY)
+
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \
+ HFI1_USER_SWMINOR)
+
+#ifndef HFI1_KERN_TYPE
+#define HFI1_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a Intel release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of hfi1_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
+
+/*
+ * Define the driver version number. This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#ifndef HFI1_DRIVER_VERSION_BASE
+#define HFI1_DRIVER_VERSION_BASE "0.9-294"
+#endif
+
+/* create the final driver version string */
+#ifdef HFI1_IDSTR
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
+#else
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * Diagnostics can send a packet by writing the following
+ * struct to the diag packet special file.
+ *
+ * This allows a custom PBC qword, so that special modes and deliberate
+ * changes to CRCs can be used.
+ */
+#define _DIAG_PKT_VERS 1
+struct diag_pkt {
+ __u16 version; /* structure version */
+ __u16 unit; /* which device */
+ __u16 sw_index; /* send sw index to use */
+ __u16 len; /* data length, in bytes */
+ __u16 port; /* port number */
+ __u16 unused;
+ __u32 flags; /* call flags */
+ __u64 data; /* user data pointer */
+ __u64 pbc; /* PBC for the packet */
+};
+
+/* diag_pkt flags */
+#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/*
+ * Receive Header Flags
+ */
+#define RHF_PKT_LEN_SHIFT 0
+#define RHF_PKT_LEN_MASK 0xfffull
+#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
+
+#define RHF_RCV_TYPE_SHIFT 12
+#define RHF_RCV_TYPE_MASK 0x7ull
+#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
+
+#define RHF_USE_EGR_BFR_SHIFT 15
+#define RHF_USE_EGR_BFR_MASK 0x1ull
+#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
+
+#define RHF_EGR_INDEX_SHIFT 16
+#define RHF_EGR_INDEX_MASK 0x7ffull
+#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
+
+#define RHF_DC_INFO_SHIFT 27
+#define RHF_DC_INFO_MASK 0x1ull
+#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
+
+#define RHF_RCV_SEQ_SHIFT 28
+#define RHF_RCV_SEQ_MASK 0xfull
+#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
+
+#define RHF_EGR_OFFSET_SHIFT 32
+#define RHF_EGR_OFFSET_MASK 0xfffull
+#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
+#define RHF_HDRQ_OFFSET_SHIFT 44
+#define RHF_HDRQ_OFFSET_MASK 0x1ffull
+#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
+#define RHF_K_HDR_LEN_ERR (0x1ull << 53)
+#define RHF_DC_UNC_ERR (0x1ull << 54)
+#define RHF_DC_ERR (0x1ull << 55)
+#define RHF_RCV_TYPE_ERR_SHIFT 56
+#define RHF_RCV_TYPE_ERR_MASK 0x7ul
+#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
+#define RHF_TID_ERR (0x1ull << 59)
+#define RHF_LEN_ERR (0x1ull << 60)
+#define RHF_ECC_ERR (0x1ull << 61)
+#define RHF_VCRC_ERR (0x1ull << 62)
+#define RHF_ICRC_ERR (0x1ull << 63)
+
+#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */
+
+/* RHF receive types */
+#define RHF_RCV_TYPE_EXPECTED 0
+#define RHF_RCV_TYPE_EAGER 1
+#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */
+#define RHF_RCV_TYPE_ERROR 3
+#define RHF_RCV_TYPE_BYPASS 4
+#define RHF_RCV_TYPE_INVALID5 5
+#define RHF_RCV_TYPE_INVALID6 6
+#define RHF_RCV_TYPE_INVALID7 7
+
+/* RHF receive type error - expected packet errors */
+#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2
+#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4
+
+/* RHF receive type error - eager packet errors */
+#define RHF_RTE_EAGER_NO_ERR 0x0
+
+/* RHF receive type error - IB packet errors */
+#define RHF_RTE_IB_NO_ERR 0x0
+
+/* RHF receive type error - error packet errors */
+#define RHF_RTE_ERROR_NO_ERR 0x0
+#define RHF_RTE_ERROR_OP_CODE_ERR 0x1
+#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
+#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3
+#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4
+#define RHF_RTE_ERROR_CONTEXT_ERR 0x5
+#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6
+
+/* RHF receive type error - bypass packet errors */
+#define RHF_RTE_BYPASS_NO_ERR 0x0
+
+/*
+ * This structure contains the first field common to all protocols
+ * that employ this chip.
+ */
+struct hfi1_message_header {
+ __be16 lrh[4];
+};
+
+/* IB - LRH header constants */
+#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define LIM_MGMT_P_KEY 0x7FFF
+#define FULL_MGMT_P_KEY 0xFFFF
+
+#define DEFAULT_P_KEY LIM_MGMT_P_KEY
+#define HFI1_AETH_CREDIT_SHIFT 24
+#define HFI1_AETH_CREDIT_MASK 0x1F
+#define HFI1_AETH_CREDIT_INVAL 0x1F
+#define HFI1_MSN_MASK 0xFFFFFF
+#define HFI1_FECN_SHIFT 31
+#define HFI1_FECN_MASK 1
+#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT)
+#define HFI1_BECN_SHIFT 30
+#define HFI1_BECN_MASK 1
+#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT)
+
+#define HFI1_PSM_IOC_BASE_SEQ 0x0
+
+static inline __u64 rhf_to_cpu(const __le32 *rbuf)
+{
+ return __le64_to_cpu(*((__le64 *)rbuf));
+}
+
+static inline u64 rhf_err_flags(u64 rhf)
+{
+ return rhf & RHF_ERROR_SMASK;
+}
+
+static inline u32 rhf_rcv_type(u64 rhf)
+{
+ return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
+}
+
+static inline u32 rhf_rcv_type_err(u64 rhf)
+{
+ return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
+}
+
+/* return size is in bytes, not DWORDs */
+static inline u32 rhf_pkt_len(u64 rhf)
+{
+ return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
+}
+
+static inline u32 rhf_egr_index(u64 rhf)
+{
+ return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
+}
+
+static inline u32 rhf_rcv_seq(u64 rhf)
+{
+ return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
+}
+
+/* returned offset is in DWORDS */
+static inline u32 rhf_hdrq_offset(u64 rhf)
+{
+ return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline u64 rhf_use_egr_bfr(u64 rhf)
+{
+ return rhf & RHF_USE_EGR_BFR_SMASK;
+}
+
+static inline u64 rhf_dc_info(u64 rhf)
+{
+ return rhf & RHF_DC_INFO_SMASK;
+}
+
+static inline u32 rhf_egr_buf_offset(u64 rhf)
+{
+ return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
+}
+#endif /* _COMMON_H */
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
new file mode 100644
index 000000000000..5e9be16f6cd3
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -0,0 +1,1121 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "debugfs.h"
+#include "device.h"
+#include "qp.h"
+#include "sdma.h"
+
+static struct dentry *hfi1_dbg_root;
+
+/* wrappers to enforce srcu in seq file */
+static ssize_t hfi1_seq_read(
+ struct file *file,
+ char __user *buf,
+ size_t size,
+ loff_t *ppos)
+{
+ struct dentry *d = file->f_path.dentry;
+ int srcu_idx;
+ ssize_t r;
+
+ r = debugfs_use_file_start(d, &srcu_idx);
+ if (likely(!r))
+ r = seq_read(file, buf, size, ppos);
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+static loff_t hfi1_seq_lseek(
+ struct file *file,
+ loff_t offset,
+ int whence)
+{
+ struct dentry *d = file->f_path.dentry;
+ int srcu_idx;
+ loff_t r;
+
+ r = debugfs_use_file_start(d, &srcu_idx);
+ if (likely(!r))
+ r = seq_lseek(file, offset, whence);
+ debugfs_use_file_finish(srcu_idx);
+ return r;
+}
+
+#define private2dd(file) (file_inode(file)->i_private)
+#define private2ppd(file) (file_inode(file)->i_private)
+
+#define DEBUGFS_SEQ_FILE_OPS(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+ .start = _##name##_seq_start, \
+ .next = _##name##_seq_next, \
+ .stop = _##name##_seq_stop, \
+ .show = _##name##_seq_show \
+}
+
+#define DEBUGFS_SEQ_FILE_OPEN(name) \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+ struct seq_file *seq; \
+ int ret; \
+ ret = seq_open(s, &_##name##_seq_ops); \
+ if (ret) \
+ return ret; \
+ seq = s->private_data; \
+ seq->private = inode->i_private; \
+ return 0; \
+}
+
+#define DEBUGFS_FILE_OPS(name) \
+static const struct file_operations _##name##_file_ops = { \
+ .owner = THIS_MODULE, \
+ .open = _##name##_open, \
+ .read = hfi1_seq_read, \
+ .llseek = hfi1_seq_lseek, \
+ .release = seq_release \
+}
+
+#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \
+do { \
+ struct dentry *ent; \
+ ent = debugfs_create_file(name, mode, parent, \
+ data, ops); \
+ if (!ent) \
+ pr_warn("create of %s failed\n", name); \
+} while (0)
+
+#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
+ DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct hfi1_opcode_stats_perctx *opstats;
+
+ if (*pos >= ARRAY_SIZE(opstats->stats))
+ return NULL;
+ return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_opcode_stats_perctx *opstats;
+
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(opstats->stats))
+ return NULL;
+ return pos;
+}
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+ loff_t i = *spos, j;
+ u64 n_packets = 0, n_bytes = 0;
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ for (j = 0; j < dd->first_user_ctxt; j++) {
+ if (!dd->rcd[j])
+ continue;
+ n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+ n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+ }
+ if (!n_packets && !n_bytes)
+ return SEQ_SKIP;
+ seq_printf(s, "%02llx %llu/%llu\n", i,
+ (unsigned long long)n_packets,
+ (unsigned long long)n_bytes);
+
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(opcode_stats);
+DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
+DEBUGFS_FILE_OPS(opcode_stats);
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ if (!*pos)
+ return SEQ_START_TOKEN;
+ if (*pos >= dd->first_user_ctxt)
+ return NULL;
+ return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ if (v == SEQ_START_TOKEN)
+ return pos;
+
+ ++*pos;
+ if (*pos >= dd->first_user_ctxt)
+ return NULL;
+ return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+ /* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos;
+ loff_t i, j;
+ u64 n_packets = 0;
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "Ctx:npkts\n");
+ return 0;
+ }
+
+ spos = v;
+ i = *spos;
+
+ if (!dd->rcd[i])
+ return SEQ_SKIP;
+
+ for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+ n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+ if (!n_packets)
+ return SEQ_SKIP;
+
+ seq_printf(s, " %llu:%llu\n", i, n_packets);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(ctx_stats);
+DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
+DEBUGFS_FILE_OPS(ctx_stats);
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(RCU)
+{
+ struct qp_iter *iter;
+ loff_t n = *pos;
+
+ iter = qp_iter_init(s->private);
+
+ /* stop calls rcu_read_unlock */
+ rcu_read_lock();
+
+ if (!iter)
+ return NULL;
+
+ do {
+ if (qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ } while (n--);
+
+ return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+ loff_t *pos)
+ __must_hold(RCU)
+{
+ struct qp_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+ struct qp_iter *iter = iter_ptr;
+
+ if (!iter)
+ return 0;
+
+ qp_iter_print(s, iter);
+
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(qp_stats);
+DEBUGFS_SEQ_FILE_OPEN(qp_stats)
+DEBUGFS_FILE_OPS(qp_stats);
+
+static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd;
+ struct hfi1_devdata *dd;
+
+ ibd = (struct hfi1_ibdev *)s->private;
+ dd = dd_from_dev(ibd);
+ if (!dd->per_sdma || *pos >= dd->num_sdma)
+ return NULL;
+ return pos;
+}
+
+static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ ++*pos;
+ if (!dd->per_sdma || *pos >= dd->num_sdma)
+ return NULL;
+ return pos;
+}
+
+static void _sdes_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static int _sdes_seq_show(struct seq_file *s, void *v)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+ loff_t *spos = v;
+ loff_t i = *spos;
+
+ sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(sdes);
+DEBUGFS_SEQ_FILE_OPEN(sdes)
+DEBUGFS_FILE_OPS(sdes);
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ ssize_t rval;
+
+ dd = private2dd(file);
+ avail = hfi1_read_cntrs(dd, NULL, &counters);
+ rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+ return rval;
+}
+
+/* read the per-device counters */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *names;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ ssize_t rval;
+
+ dd = private2dd(file);
+ avail = hfi1_read_cntrs(dd, &names, NULL);
+ rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+ return rval;
+}
+
+struct counter_info {
+ char *name;
+ const struct file_operations ops;
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *names;
+ size_t avail;
+ struct hfi1_devdata *dd;
+ ssize_t rval;
+
+ dd = private2dd(file);
+ avail = hfi1_read_portcntrs(dd->pport, &names, NULL);
+ rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+ return rval;
+}
+
+/* read the per-port counters */
+static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct hfi1_pportdata *ppd;
+ ssize_t rval;
+
+ ppd = private2ppd(file);
+ avail = hfi1_read_portcntrs(ppd, NULL, &counters);
+ rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+ return rval;
+}
+
+static void check_dyn_flag(u64 scratch0, char *p, int size, int *used,
+ int this_hfi, int hfi, u32 flag, const char *what)
+{
+ u32 mask;
+
+ mask = flag << (hfi ? CR_DYN_SHIFT : 0);
+ if (scratch0 & mask) {
+ *used += scnprintf(p + *used, size - *used,
+ " 0x%08x - HFI%d %s in use, %s device\n",
+ mask, hfi, what,
+ this_hfi == hfi ? "this" : "other");
+ }
+}
+
+static ssize_t asic_flags_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ u64 scratch0;
+ char *tmp;
+ int ret = 0;
+ int size;
+ int used;
+ int i;
+
+ ppd = private2ppd(file);
+ dd = ppd->dd;
+ size = PAGE_SIZE;
+ used = 0;
+ tmp = kmalloc(size, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+ used += scnprintf(tmp + used, size - used,
+ "Resource flags: 0x%016llx\n", scratch0);
+
+ /* check permanent flag */
+ if (scratch0 & CR_THERM_INIT) {
+ used += scnprintf(tmp + used, size - used,
+ " 0x%08x - thermal monitoring initialized\n",
+ (u32)CR_THERM_INIT);
+ }
+
+ /* check each dynamic flag on each HFI */
+ for (i = 0; i < 2; i++) {
+ check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+ CR_SBUS, "SBus");
+ check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+ CR_EPROM, "EPROM");
+ check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+ CR_I2C1, "i2c chain 1");
+ check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i,
+ CR_I2C2, "i2c chain 2");
+ }
+ used += scnprintf(tmp + used, size - used, "Write bits to clear\n");
+
+ ret = simple_read_from_buffer(buf, count, ppos, tmp, used);
+ kfree(tmp);
+ return ret;
+}
+
+static ssize_t asic_flags_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ char *buff;
+ int ret;
+ unsigned long long value;
+ u64 scratch0;
+ u64 clear;
+
+ ppd = private2ppd(file);
+ dd = ppd->dd;
+
+ buff = kmalloc(count + 1, GFP_KERNEL);
+ if (!buff)
+ return -ENOMEM;
+
+ ret = copy_from_user(buff, buf, count);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto do_free;
+ }
+
+ /* zero terminate and read the expected integer */
+ buff[count] = 0;
+ ret = kstrtoull(buff, 0, &value);
+ if (ret)
+ goto do_free;
+ clear = value;
+
+ /* obtain exclusive access */
+ mutex_lock(&dd->asic_data->asic_resource_mutex);
+ acquire_hw_mutex(dd);
+
+ scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+ scratch0 &= ~clear;
+ write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+ /* force write to be visible to other HFI on another OS */
+ (void)read_csr(dd, ASIC_CFG_SCRATCH);
+
+ release_hw_mutex(dd);
+ mutex_unlock(&dd->asic_data->asic_resource_mutex);
+
+ /* return the number of bytes written */
+ ret = count;
+
+ do_free:
+ kfree(buff);
+ return ret;
+}
+
+/*
+ * read the per-port QSFP data for ppd
+ */
+static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hfi1_pportdata *ppd;
+ char *tmp;
+ int ret;
+
+ ppd = private2ppd(file);
+ tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
+ if (ret > 0)
+ ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+ kfree(tmp);
+ return ret;
+}
+
+/* Do an i2c write operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int i2c_addr;
+ int offset;
+ int total_written;
+
+ ppd = private2ppd(file);
+
+ /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
+ i2c_addr = (*ppos >> 16) & 0xffff;
+ offset = *ppos & 0xffff;
+
+ /* explicitly reject invalid address 0 to catch cp and cat */
+ if (i2c_addr == 0)
+ return -EINVAL;
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff)
+ return -ENOMEM;
+
+ ret = copy_from_user(buff, buf, count);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
+ if (total_written < 0) {
+ ret = total_written;
+ goto _free;
+ }
+
+ *ppos += total_written;
+
+ ret = total_written;
+
+ _free:
+ kfree(buff);
+ return ret;
+}
+
+/* Do an i2c write operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c write operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do an i2c read operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int i2c_addr;
+ int offset;
+ int total_read;
+
+ ppd = private2ppd(file);
+
+ /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */
+ i2c_addr = (*ppos >> 16) & 0xffff;
+ offset = *ppos & 0xffff;
+
+ /* explicitly reject invalid address 0 to catch cp and cat */
+ if (i2c_addr == 0)
+ return -EINVAL;
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff)
+ return -ENOMEM;
+
+ total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
+ if (total_read < 0) {
+ ret = total_read;
+ goto _free;
+ }
+
+ *ppos += total_read;
+
+ ret = copy_to_user(buf, buff, total_read);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ ret = total_read;
+
+ _free:
+ kfree(buff);
+ return ret;
+}
+
+/* Do an i2c read operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c read operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __i2c_debugfs_read(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP write operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int total_written;
+
+ if (*ppos + count > QSFP_PAGESIZE * 4) /* base page + page00-page03 */
+ return -EINVAL;
+
+ ppd = private2ppd(file);
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff)
+ return -ENOMEM;
+
+ ret = copy_from_user(buff, buf, count);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+ total_written = qsfp_write(ppd, target, *ppos, buff, count);
+ if (total_written < 0) {
+ ret = total_written;
+ goto _free;
+ }
+
+ *ppos += total_written;
+
+ ret = total_written;
+
+ _free:
+ kfree(buff);
+ return ret;
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP read operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ char *buff;
+ int ret;
+ int total_read;
+
+ if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+ ret = -EINVAL;
+ goto _return;
+ }
+
+ ppd = private2ppd(file);
+
+ buff = kmalloc(count, GFP_KERNEL);
+ if (!buff) {
+ ret = -ENOMEM;
+ goto _return;
+ }
+
+ total_read = qsfp_read(ppd, target, *ppos, buff, count);
+ if (total_read < 0) {
+ ret = total_read;
+ goto _free;
+ }
+
+ *ppos += total_read;
+
+ ret = copy_to_user(buf, buff, total_read);
+ if (ret > 0) {
+ ret = -EFAULT;
+ goto _free;
+ }
+
+ ret = total_read;
+
+ _free:
+ kfree(buff);
+ _return:
+ return ret;
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return __qsfp_debugfs_read(file, buf, count, ppos, 1);
+}
+
+static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ int ret;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ ppd = private2ppd(fp);
+
+ ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
+ if (ret) /* failed - release the module */
+ module_put(THIS_MODULE);
+
+ return ret;
+}
+
+static int i2c1_debugfs_open(struct inode *in, struct file *fp)
+{
+ return __i2c_debugfs_open(in, fp, 0);
+}
+
+static int i2c2_debugfs_open(struct inode *in, struct file *fp)
+{
+ return __i2c_debugfs_open(in, fp, 1);
+}
+
+static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+
+ ppd = private2ppd(fp);
+
+ release_chip_resource(ppd->dd, i2c_target(target));
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+static int i2c1_debugfs_release(struct inode *in, struct file *fp)
+{
+ return __i2c_debugfs_release(in, fp, 0);
+}
+
+static int i2c2_debugfs_release(struct inode *in, struct file *fp)
+{
+ return __i2c_debugfs_release(in, fp, 1);
+}
+
+static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+ int ret;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ ppd = private2ppd(fp);
+
+ ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
+ if (ret) /* failed - release the module */
+ module_put(THIS_MODULE);
+
+ return ret;
+}
+
+static int qsfp1_debugfs_open(struct inode *in, struct file *fp)
+{
+ return __qsfp_debugfs_open(in, fp, 0);
+}
+
+static int qsfp2_debugfs_open(struct inode *in, struct file *fp)
+{
+ return __qsfp_debugfs_open(in, fp, 1);
+}
+
+static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target)
+{
+ struct hfi1_pportdata *ppd;
+
+ ppd = private2ppd(fp);
+
+ release_chip_resource(ppd->dd, i2c_target(target));
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+static int qsfp1_debugfs_release(struct inode *in, struct file *fp)
+{
+ return __qsfp_debugfs_release(in, fp, 0);
+}
+
+static int qsfp2_debugfs_release(struct inode *in, struct file *fp)
+{
+ return __qsfp_debugfs_release(in, fp, 1);
+}
+
+#define DEBUGFS_OPS(nm, readroutine, writeroutine) \
+{ \
+ .name = nm, \
+ .ops = { \
+ .read = readroutine, \
+ .write = writeroutine, \
+ .llseek = generic_file_llseek, \
+ }, \
+}
+
+#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \
+{ \
+ .name = nm, \
+ .ops = { \
+ .read = readf, \
+ .write = writef, \
+ .llseek = generic_file_llseek, \
+ .open = openf, \
+ .release = releasef \
+ }, \
+}
+
+static const struct counter_info cntr_ops[] = {
+ DEBUGFS_OPS("counter_names", dev_names_read, NULL),
+ DEBUGFS_OPS("counters", dev_counters_read, NULL),
+ DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
+};
+
+static const struct counter_info port_cntr_ops[] = {
+ DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
+ DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write,
+ i2c1_debugfs_open, i2c1_debugfs_release),
+ DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write,
+ i2c2_debugfs_open, i2c2_debugfs_release),
+ DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
+ DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write,
+ qsfp1_debugfs_open, qsfp1_debugfs_release),
+ DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
+ qsfp2_debugfs_open, qsfp2_debugfs_release),
+ DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
+};
+
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+ char name[sizeof("port0counters") + 1];
+ char link[10];
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+ struct hfi1_pportdata *ppd;
+ int unit = dd->unit;
+ int i, j;
+
+ if (!hfi1_dbg_root)
+ return;
+ snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
+ snprintf(link, sizeof(link), "%d", unit);
+ ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
+ if (!ibd->hfi1_ibdev_dbg) {
+ pr_warn("create of %s failed\n", name);
+ return;
+ }
+ ibd->hfi1_ibdev_link =
+ debugfs_create_symlink(link, hfi1_dbg_root, name);
+ if (!ibd->hfi1_ibdev_link) {
+ pr_warn("create of %s symlink failed\n", name);
+ return;
+ }
+ DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
+ DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
+ DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
+ DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
+ /* dev counter files */
+ for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
+ DEBUGFS_FILE_CREATE(cntr_ops[i].name,
+ ibd->hfi1_ibdev_dbg,
+ dd,
+ &cntr_ops[i].ops, S_IRUGO);
+ /* per port files */
+ for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
+ for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
+ snprintf(name,
+ sizeof(name),
+ port_cntr_ops[i].name,
+ j + 1);
+ DEBUGFS_FILE_CREATE(name,
+ ibd->hfi1_ibdev_dbg,
+ ppd,
+ &port_cntr_ops[i].ops,
+ !port_cntr_ops[i].ops.write ?
+ S_IRUGO : S_IRUGO | S_IWUSR);
+ }
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+ if (!hfi1_dbg_root)
+ goto out;
+ debugfs_remove(ibd->hfi1_ibdev_link);
+ debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
+out:
+ ibd->hfi1_ibdev_dbg = NULL;
+}
+
+/*
+ * driver stats field names, one line per stat, single string. Used by
+ * programs like hfistats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * if hfi1_ib_stats changes, this needs to change. Names need to be
+ * 12 chars or less (w/o newline), for proper display by hfistats utility.
+ */
+static const char * const hfi1_statnames[] = {
+ /* must be element 0*/
+ "KernIntr",
+ "ErrorIntr",
+ "Tx_Errs",
+ "Rcv_Errs",
+ "H/W_Errs",
+ "NoPIOBufs",
+ "CtxtsOpen",
+ "RcvLen_Errs",
+ "EgrBufFull",
+ "EgrHdrFull"
+};
+
+static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void *_driver_stats_names_seq_next(
+ struct seq_file *s,
+ void *v,
+ loff_t *pos)
+{
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+
+ seq_printf(s, "%s\n", hfi1_statnames[*spos]);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
+DEBUGFS_FILE_OPS(driver_stats_names);
+
+static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(hfi1_statnames))
+ return NULL;
+ return pos;
+}
+
+static void _driver_stats_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static u64 hfi1_sps_ints(void)
+{
+ unsigned long flags;
+ struct hfi1_devdata *dd;
+ u64 sps_ints = 0;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ list_for_each_entry(dd, &hfi1_dev_list, list) {
+ sps_ints += get_all_cpu_total(dd->int_counter);
+ }
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ return sps_ints;
+}
+
+static int _driver_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+ char *buffer;
+ u64 *stats = (u64 *)&hfi1_stats;
+ size_t sz = seq_get_buf(s, &buffer);
+
+ if (sz < sizeof(u64))
+ return SEQ_SKIP;
+ /* special case for interrupts */
+ if (*spos == 0)
+ *(u64 *)buffer = hfi1_sps_ints();
+ else
+ *(u64 *)buffer = stats[*spos];
+ seq_commit(s, sizeof(u64));
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats)
+DEBUGFS_FILE_OPS(driver_stats);
+
+void hfi1_dbg_init(void)
+{
+ hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
+ if (!hfi1_dbg_root)
+ pr_warn("init of debugfs failed\n");
+ DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
+ DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+}
+
+void hfi1_dbg_exit(void)
+{
+ debugfs_remove_recursive(hfi1_dbg_root);
+ hfi1_dbg_root = NULL;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
new file mode 100644
index 000000000000..b6fb6814f1b8
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/debugfs.h
@@ -0,0 +1,75 @@
+#ifndef _HFI1_DEBUGFS_H
+#define _HFI1_DEBUGFS_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_ibdev;
+#ifdef CONFIG_DEBUG_FS
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
+void hfi1_dbg_init(void);
+void hfi1_dbg_exit(void);
+#else
+static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+}
+
+void hfi1_dbg_init(void)
+{
+}
+
+void hfi1_dbg_exit(void)
+{
+}
+
+#endif
+
+#endif /* _HFI1_DEBUGFS_H */
diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c
new file mode 100644
index 000000000000..bf64b5a7bfd7
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/device.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+
+#include "hfi.h"
+#include "device.h"
+
+static struct class *class;
+static struct class *user_class;
+static dev_t hfi1_dev;
+
+int hfi1_cdev_init(int minor, const char *name,
+ const struct file_operations *fops,
+ struct cdev *cdev, struct device **devp,
+ bool user_accessible,
+ struct kobject *parent)
+{
+ const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
+ struct device *device = NULL;
+ int ret;
+
+ cdev_init(cdev, fops);
+ cdev->owner = THIS_MODULE;
+ cdev->kobj.parent = parent;
+ kobject_set_name(&cdev->kobj, name);
+
+ ret = cdev_add(cdev, dev, 1);
+ if (ret < 0) {
+ pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto done;
+ }
+
+ if (user_accessible)
+ device = device_create(user_class, NULL, dev, NULL, "%s", name);
+ else
+ device = device_create(class, NULL, dev, NULL, "%s", name);
+
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ device = NULL;
+ pr_err("Could not create device for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ cdev_del(cdev);
+ }
+done:
+ *devp = device;
+ return ret;
+}
+
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
+{
+ struct device *device = *devp;
+
+ if (device) {
+ device_unregister(device);
+ *devp = NULL;
+
+ cdev_del(cdev);
+ }
+}
+
+static const char *hfi1_class_name = "hfi1";
+
+const char *class_name(void)
+{
+ return hfi1_class_name;
+}
+
+static char *hfi1_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0600;
+ return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+static const char *hfi1_class_name_user = "hfi1_user";
+static const char *class_name_user(void)
+{
+ return hfi1_class_name_user;
+}
+
+static char *hfi1_user_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "%s", dev_name(dev));
+}
+
+int __init dev_init(void)
+{
+ int ret;
+
+ ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
+ if (ret < 0) {
+ pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+ goto done;
+ }
+
+ class = class_create(THIS_MODULE, class_name());
+ if (IS_ERR(class)) {
+ ret = PTR_ERR(class);
+ pr_err("Could not create device class (err %d)\n", -ret);
+ unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+ goto done;
+ }
+ class->devnode = hfi1_devnode;
+
+ user_class = class_create(THIS_MODULE, class_name_user());
+ if (IS_ERR(user_class)) {
+ ret = PTR_ERR(user_class);
+ pr_err("Could not create device class for user accessible files (err %d)\n",
+ -ret);
+ class_destroy(class);
+ class = NULL;
+ user_class = NULL;
+ unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+ goto done;
+ }
+ user_class->devnode = hfi1_user_devnode;
+
+done:
+ return ret;
+}
+
+void dev_cleanup(void)
+{
+ class_destroy(class);
+ class = NULL;
+
+ class_destroy(user_class);
+ user_class = NULL;
+
+ unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+}
diff --git a/drivers/infiniband/hw/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h
new file mode 100644
index 000000000000..c3ec19cb0ac9
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/device.h
@@ -0,0 +1,60 @@
+#ifndef _HFI1_DEVICE_H
+#define _HFI1_DEVICE_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+int hfi1_cdev_init(int minor, const char *name,
+ const struct file_operations *fops,
+ struct cdev *cdev, struct device **devp,
+ bool user_accessible,
+ struct kobject *parent);
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
+const char *class_name(void);
+int __init dev_init(void);
+void dev_cleanup(void);
+
+#endif /* _HFI1_DEVICE_H */
diff --git a/drivers/infiniband/hw/hfi1/dma.c b/drivers/infiniband/hw/hfi1/dma.c
new file mode 100644
index 000000000000..7e8dab892848
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/dma.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64)0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
+ size_t size, enum dma_data_direction direction)
+{
+ if (WARN_ON(!valid_dma_direction(direction)))
+ return BAD_DMA_ADDRESS;
+
+ return (u64)cpu_addr;
+}
+
+static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is a stub, nothing to be done here */
+}
+
+static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ if (WARN_ON(!valid_dma_direction(direction)))
+ return BAD_DMA_ADDRESS;
+
+ if (offset + size > PAGE_SIZE)
+ return BAD_DMA_ADDRESS;
+
+ addr = (u64)page_address(page);
+ if (addr)
+ addr += offset;
+
+ return addr;
+}
+
+static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is a stub, nothing to be done here */
+}
+
+static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ u64 addr;
+ int i;
+ int ret = nents;
+
+ if (WARN_ON(!valid_dma_direction(direction)))
+ return BAD_DMA_ADDRESS;
+
+ for_each_sg(sgl, sg, nents, i) {
+ addr = (u64)page_address(sg_page(sg));
+ if (!addr) {
+ ret = 0;
+ break;
+ }
+ sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = sg->length;
+#endif
+ }
+ return ret;
+}
+
+static void hfi1_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ /* This is a stub, nothing to be done here */
+}
+
+static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p)
+ addr = page_address(p);
+ if (dma_handle)
+ *dma_handle = (u64)addr;
+ return addr;
+}
+
+static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
+ .mapping_error = hfi1_mapping_error,
+ .map_single = hfi1_dma_map_single,
+ .unmap_single = hfi1_dma_unmap_single,
+ .map_page = hfi1_dma_map_page,
+ .unmap_page = hfi1_dma_unmap_page,
+ .map_sg = hfi1_map_sg,
+ .unmap_sg = hfi1_unmap_sg,
+ .sync_single_for_cpu = hfi1_sync_single_for_cpu,
+ .sync_single_for_device = hfi1_sync_single_for_device,
+ .alloc_coherent = hfi1_dma_alloc_coherent,
+ .free_coherent = hfi1_dma_free_coherent
+};
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
new file mode 100644
index 000000000000..303f10555729
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+#include <rdma/ib_verbs.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the initialization code.
+ */
+const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(hfi1_devs_lock);
+LIST_HEAD(hfi1_dev_list);
+DEFINE_MUTEX(hfi1_mutex); /* general driver use */
+
+unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify(
+ HFI1_DEFAULT_MAX_MTU));
+
+unsigned int hfi1_cu = 1;
+module_param_named(cu, hfi1_cu, uint, S_IRUGO);
+MODULE_PARM_DESC(cu, "Credit return units");
+
+unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
+static int hfi1_caps_set(const char *, const struct kernel_param *);
+static int hfi1_caps_get(char *, const struct kernel_param *);
+static const struct kernel_param_ops cap_ops = {
+ .set = hfi1_caps_set,
+ .get = hfi1_caps_get
+};
+module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
+MODULE_VERSION(HFI1_DRIVER_VERSION);
+
+/*
+ * MAX_PKT_RCV is the max # if packets processed per receive interrupt.
+ */
+#define MAX_PKT_RECV 64
+#define EGR_HEAD_UPDATE_THRESHOLD 16
+
+struct hfi1_ib_stats hfi1_stats;
+
+static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
+{
+ int ret = 0;
+ unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
+ cap_mask = *cap_mask_ptr, value, diff,
+ write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
+ HFI1_CAP_WRITABLE_MASK);
+
+ ret = kstrtoul(val, 0, &value);
+ if (ret) {
+ pr_warn("Invalid module parameter value for 'cap_mask'\n");
+ goto done;
+ }
+ /* Get the changed bits (except the locked bit) */
+ diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
+
+ /* Remove any bits that are not allowed to change after driver load */
+ if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
+ pr_warn("Ignoring non-writable capability bits %#lx\n",
+ diff & ~write_mask);
+ diff &= write_mask;
+ }
+
+ /* Mask off any reserved bits */
+ diff &= ~HFI1_CAP_RESERVED_MASK;
+ /* Clear any previously set and changing bits */
+ cap_mask &= ~diff;
+ /* Update the bits with the new capability */
+ cap_mask |= (value & diff);
+ /* Check for any kernel/user restrictions */
+ diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
+ ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
+ cap_mask &= ~diff;
+ /* Set the bitmask to the final set */
+ *cap_mask_ptr = cap_mask;
+done:
+ return ret;
+}
+
+static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
+{
+ unsigned long cap_mask = *(unsigned long *)kp->arg;
+
+ cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
+ cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
+
+ return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
+}
+
+const char *get_unit_name(int unit)
+{
+ static char iname[16];
+
+ snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit);
+ return iname;
+}
+
+const char *get_card_name(struct rvt_dev_info *rdi)
+{
+ struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
+ struct hfi1_devdata *dd = container_of(ibdev,
+ struct hfi1_devdata, verbs_dev);
+ return get_unit_name(dd->unit);
+}
+
+struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi)
+{
+ struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi);
+ struct hfi1_devdata *dd = container_of(ibdev,
+ struct hfi1_devdata, verbs_dev);
+ return dd->pcidev;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int hfi1_count_active_units(void)
+{
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ unsigned long flags;
+ int pidx, nunits_active = 0;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ list_for_each_entry(dd, &hfi1_dev_list, list) {
+ if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+ continue;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->lid && ppd->linkup) {
+ nunits_active++;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ return nunits_active;
+}
+
+/*
+ * Return count of all units, optionally return in arguments
+ * the number of usable (present) units, and the number of
+ * ports that are up.
+ */
+int hfi1_count_units(int *npresentp, int *nupp)
+{
+ int nunits = 0, npresent = 0, nup = 0;
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+ int pidx;
+ struct hfi1_pportdata *ppd;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+ list_for_each_entry(dd, &hfi1_dev_list, list) {
+ nunits++;
+ if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
+ npresent++;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->lid && ppd->linkup)
+ nup++;
+ }
+ }
+
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+ if (npresentp)
+ *npresentp = npresent;
+ if (nupp)
+ *nupp = nup;
+
+ return nunits;
+}
+
+/*
+ * Get address of eager buffer from it's index (allocated in chunks, not
+ * contiguous).
+ */
+static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
+ u8 *update)
+{
+ u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
+
+ *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
+ return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
+ (offset * RCV_BUF_BLOCK_SIZE));
+}
+
+/*
+ * Validate and encode the a given RcvArray Buffer size.
+ * The function will check whether the given size falls within
+ * allowed size ranges for the respective type and, optionally,
+ * return the proper encoding.
+ */
+inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
+{
+ if (unlikely(!PAGE_ALIGNED(size)))
+ return 0;
+ if (unlikely(size < MIN_EAGER_BUFFER))
+ return 0;
+ if (size >
+ (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
+ return 0;
+ if (encoded)
+ *encoded = ilog2(size / PAGE_SIZE) + 1;
+ return 1;
+}
+
+static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_message_header *rhdr = packet->hdr;
+ u32 rte = rhf_rcv_type_err(packet->rhf);
+ int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+
+ if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+ return;
+
+ if (packet->rhf & RHF_TID_ERR) {
+ /* For TIDERR and RC QPs preemptively schedule a NAK */
+ struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
+ struct hfi1_other_headers *ohdr = NULL;
+ u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ u32 qp_num;
+ u32 rcv_flags = 0;
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto drop;
+
+ /* Check for GRH */
+ if (lnh == HFI1_LRH_BTH) {
+ ohdr = &hdr->u.oth;
+ } else if (lnh == HFI1_LRH_GRH) {
+ u32 vtf;
+
+ ohdr = &hdr->u.l.oth;
+ if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+ goto drop;
+ vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+ if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+ goto drop;
+ rcv_flags |= HFI1_HAS_GRH;
+ } else {
+ goto drop;
+ }
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+ if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+ struct rvt_qp *qp;
+ unsigned long flags;
+
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!qp) {
+ rcu_read_unlock();
+ goto drop;
+ }
+
+ /*
+ * Handle only RC QPs - for other QP types drop error
+ * packet.
+ */
+ spin_lock_irqsave(&qp->r_lock, flags);
+
+ /* Check for valid receive state. */
+ if (!(ib_rvt_state_ops[qp->state] &
+ RVT_PROCESS_RECV_OK)) {
+ ibp->rvp.n_pkt_drops++;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ hfi1_rc_hdrerr(
+ rcd,
+ hdr,
+ rcv_flags,
+ qp);
+ break;
+ default:
+ /* For now don't handle any other QP types */
+ break;
+ }
+
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+ rcu_read_unlock();
+ } /* Unicast QP */
+ } /* Valid packet with TIDErr */
+
+ /* handle "RcvTypeErr" flags */
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ {
+ u32 opcode;
+ void *ebuf = NULL;
+ __be32 *bth = NULL;
+
+ if (rhf_use_egr_bfr(packet->rhf))
+ ebuf = packet->ebuf;
+
+ if (!ebuf)
+ goto drop; /* this should never happen */
+
+ if (lnh == HFI1_LRH_BTH)
+ bth = (__be32 *)ebuf;
+ else if (lnh == HFI1_LRH_GRH)
+ bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
+ else
+ goto drop;
+
+ opcode = be32_to_cpu(bth[0]) >> 24;
+ opcode &= 0xff;
+
+ if (opcode == IB_OPCODE_CNP) {
+ /*
+ * Only in pre-B0 h/w is the CNP_OPCODE handled
+ * via this code path.
+ */
+ struct rvt_qp *qp = NULL;
+ u32 lqpn, rqpn;
+ u16 rlid;
+ u8 svc_type, sl, sc5;
+
+ sc5 = hdr2sc(rhdr, packet->rhf);
+ sl = ibp->sc_to_sl[sc5];
+
+ lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn);
+ if (!qp) {
+ rcu_read_unlock();
+ goto drop;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_UD:
+ rlid = 0;
+ rqpn = 0;
+ svc_type = IB_CC_SVCTYPE_UD;
+ break;
+ case IB_QPT_UC:
+ rlid = be16_to_cpu(rhdr->lrh[3]);
+ rqpn = qp->remote_qpn;
+ svc_type = IB_CC_SVCTYPE_UC;
+ break;
+ default:
+ goto drop;
+ }
+
+ process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+ rcu_read_unlock();
+ }
+
+ packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
+ break;
+ }
+ default:
+ break;
+ }
+
+drop:
+ return;
+}
+
+static inline void init_packet(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet)
+{
+ packet->rsize = rcd->rcvhdrqentsize; /* words */
+ packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
+ packet->rcd = rcd;
+ packet->updegr = 0;
+ packet->etail = -1;
+ packet->rhf_addr = get_rhf_addr(rcd);
+ packet->rhf = rhf_to_cpu(packet->rhf_addr);
+ packet->rhqoff = rcd->head;
+ packet->numpkt = 0;
+ packet->rcv_flags = 0;
+}
+
+void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool do_cnp)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ib_header *hdr = pkt->hdr;
+ struct hfi1_other_headers *ohdr = pkt->ohdr;
+ struct ib_grh *grh = NULL;
+ u32 rqpn = 0, bth1;
+ u16 rlid, dlid = be16_to_cpu(hdr->lrh[1]);
+ u8 sc, svc_type;
+ bool is_mcast = false;
+
+ if (pkt->rcv_flags & HFI1_HAS_GRH)
+ grh = &hdr->u.l.grh;
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ rlid = be16_to_cpu(hdr->lrh[3]);
+ rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+ svc_type = IB_CC_SVCTYPE_UD;
+ is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+ (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
+ break;
+ case IB_QPT_UC:
+ rlid = qp->remote_ah_attr.dlid;
+ rqpn = qp->remote_qpn;
+ svc_type = IB_CC_SVCTYPE_UC;
+ break;
+ case IB_QPT_RC:
+ rlid = qp->remote_ah_attr.dlid;
+ rqpn = qp->remote_qpn;
+ svc_type = IB_CC_SVCTYPE_RC;
+ break;
+ default:
+ return;
+ }
+
+ sc = hdr2sc((struct hfi1_message_header *)hdr, pkt->rhf);
+
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ if (do_cnp && (bth1 & HFI1_FECN_SMASK)) {
+ u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+ return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh);
+ }
+
+ if (!is_mcast && (bth1 & HFI1_BECN_SMASK)) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 lqpn = bth1 & RVT_QPN_MASK;
+ u8 sl = ibp->sc_to_sl[sc];
+
+ process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+ }
+
+}
+
+struct ps_mdata {
+ struct hfi1_ctxtdata *rcd;
+ u32 rsize;
+ u32 maxcnt;
+ u32 ps_head;
+ u32 ps_tail;
+ u32 ps_seq;
+};
+
+static inline void init_ps_mdata(struct ps_mdata *mdata,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ mdata->rcd = rcd;
+ mdata->rsize = packet->rsize;
+ mdata->maxcnt = packet->maxcnt;
+ mdata->ps_head = packet->rhqoff;
+
+ if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+ mdata->ps_tail = get_rcvhdrtail(rcd);
+ if (rcd->ctxt == HFI1_CTRL_CTXT)
+ mdata->ps_seq = rcd->seq_cnt;
+ else
+ mdata->ps_seq = 0; /* not used with DMA_RTAIL */
+ } else {
+ mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
+ mdata->ps_seq = rcd->seq_cnt;
+ }
+}
+
+static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
+ struct hfi1_ctxtdata *rcd)
+{
+ if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+ return mdata->ps_head == mdata->ps_tail;
+ return mdata->ps_seq != rhf_rcv_seq(rhf);
+}
+
+static inline int ps_skip(struct ps_mdata *mdata, u64 rhf,
+ struct hfi1_ctxtdata *rcd)
+{
+ /*
+ * Control context can potentially receive an invalid rhf.
+ * Drop such packets.
+ */
+ if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail))
+ return mdata->ps_seq != rhf_rcv_seq(rhf);
+
+ return 0;
+}
+
+static inline void update_ps_mdata(struct ps_mdata *mdata,
+ struct hfi1_ctxtdata *rcd)
+{
+ mdata->ps_head += mdata->rsize;
+ if (mdata->ps_head >= mdata->maxcnt)
+ mdata->ps_head = 0;
+
+ /* Control context must do seq counting */
+ if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
+ (rcd->ctxt == HFI1_CTRL_CTXT)) {
+ if (++mdata->ps_seq > 13)
+ mdata->ps_seq = 1;
+ }
+}
+
+/*
+ * prescan_rxq - search through the receive queue looking for packets
+ * containing Excplicit Congestion Notifications (FECNs, or BECNs).
+ * When an ECN is found, process the Congestion Notification, and toggle
+ * it off.
+ * This is declared as a macro to allow quick checking of the port to avoid
+ * the overhead of a function call if not enabled.
+ */
+#define prescan_rxq(rcd, packet) \
+ do { \
+ if (rcd->ppd->cc_prescan) \
+ __prescan_rxq(packet); \
+ } while (0)
+static void __prescan_rxq(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ps_mdata mdata;
+
+ init_ps_mdata(&mdata, packet);
+
+ while (1) {
+ struct hfi1_devdata *dd = rcd->dd;
+ struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
+ __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
+ dd->rhf_offset;
+ struct rvt_qp *qp;
+ struct hfi1_ib_header *hdr;
+ struct hfi1_other_headers *ohdr;
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+ u64 rhf = rhf_to_cpu(rhf_addr);
+ u32 etype = rhf_rcv_type(rhf), qpn, bth1;
+ int is_ecn = 0;
+ u8 lnh;
+
+ if (ps_done(&mdata, rhf, rcd))
+ break;
+
+ if (ps_skip(&mdata, rhf, rcd))
+ goto next;
+
+ if (etype != RHF_RCV_TYPE_IB)
+ goto next;
+
+ hdr = (struct hfi1_ib_header *)
+ hfi1_get_msgheader(dd, rhf_addr);
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == HFI1_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else
+ goto next; /* just in case */
+
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
+
+ if (!is_ecn)
+ goto next;
+
+ qpn = bth1 & RVT_QPN_MASK;
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
+
+ if (!qp) {
+ rcu_read_unlock();
+ goto next;
+ }
+
+ process_ecn(qp, packet, true);
+ rcu_read_unlock();
+
+ /* turn off BECN, FECN */
+ bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK);
+ ohdr->bth[1] = cpu_to_be32(bth1);
+next:
+ update_ps_mdata(&mdata, rcd);
+ }
+}
+
+static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread)
+{
+ int ret = RCV_PKT_OK;
+
+ /* Set up for the next packet */
+ packet->rhqoff += packet->rsize;
+ if (packet->rhqoff >= packet->maxcnt)
+ packet->rhqoff = 0;
+
+ packet->numpkt++;
+ if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
+ if (thread) {
+ cond_resched();
+ } else {
+ ret = RCV_PKT_LIMIT;
+ this_cpu_inc(*packet->rcd->dd->rcv_limit);
+ }
+ }
+
+ packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
+ packet->rcd->dd->rhf_offset;
+ packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+ return ret;
+}
+
+static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
+{
+ int ret = RCV_PKT_OK;
+
+ packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
+ packet->rhf_addr);
+ packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+ packet->etype = rhf_rcv_type(packet->rhf);
+ /* total length */
+ packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+ /* retrieve eager buffer details */
+ packet->ebuf = NULL;
+ if (rhf_use_egr_bfr(packet->rhf)) {
+ packet->etail = rhf_egr_index(packet->rhf);
+ packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+ &packet->updegr);
+ /*
+ * Prefetch the contents of the eager buffer. It is
+ * OK to send a negative length to prefetch_range().
+ * The +2 is the size of the RHF.
+ */
+ prefetch_range(packet->ebuf,
+ packet->tlen - ((packet->rcd->rcvhdrqentsize -
+ (rhf_hdrq_offset(packet->rhf)
+ + 2)) * 4));
+ }
+
+ /*
+ * Call a type specific handler for the packet. We
+ * should be able to trust that etype won't be beyond
+ * the range of valid indexes. If so something is really
+ * wrong and we can probably just let things come
+ * crashing down. There is no need to eat another
+ * comparison in this performance critical code.
+ */
+ packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
+ packet->numpkt++;
+
+ /* Set up for the next packet */
+ packet->rhqoff += packet->rsize;
+ if (packet->rhqoff >= packet->maxcnt)
+ packet->rhqoff = 0;
+
+ if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) {
+ if (thread) {
+ cond_resched();
+ } else {
+ ret = RCV_PKT_LIMIT;
+ this_cpu_inc(*packet->rcd->dd->rcv_limit);
+ }
+ }
+
+ packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
+ packet->rcd->dd->rhf_offset;
+ packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+ return ret;
+}
+
+static inline void process_rcv_update(int last, struct hfi1_packet *packet)
+{
+ /*
+ * Update head regs etc., every 16 packets, if not last pkt,
+ * to help prevent rcvhdrq overflows, when many packets
+ * are processed and queue is nearly full.
+ * Don't request an interrupt for intermediate updates.
+ */
+ if (!last && !(packet->numpkt & 0xf)) {
+ update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
+ packet->etail, 0, 0);
+ packet->updegr = 0;
+ }
+ packet->rcv_flags = 0;
+}
+
+static inline void finish_packet(struct hfi1_packet *packet)
+{
+ /*
+ * Nothing we need to free for the packet.
+ *
+ * The only thing we need to do is a final update and call for an
+ * interrupt
+ */
+ update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
+ packet->etail, rcv_intr_dynamic, packet->numpkt);
+}
+
+static inline void process_rcv_qp_work(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd;
+ struct rvt_qp *qp, *nqp;
+
+ rcd = packet->rcd;
+ rcd->head = packet->rhqoff;
+
+ /*
+ * Iterate over all QPs waiting to respond.
+ * The list won't change since the IRQ is only run on one CPU.
+ */
+ list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+ list_del_init(&qp->rspwait);
+ if (qp->r_flags & RVT_R_RSP_NAK) {
+ qp->r_flags &= ~RVT_R_RSP_NAK;
+ hfi1_send_rc_ack(rcd, qp, 0);
+ }
+ if (qp->r_flags & RVT_R_RSP_SEND) {
+ unsigned long flags;
+
+ qp->r_flags &= ~RVT_R_RSP_SEND;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_rvt_state_ops[qp->state] &
+ RVT_PROCESS_OR_FLUSH_SEND)
+ hfi1_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+/*
+ * Handle receive interrupts when using the no dma rtail option.
+ */
+int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
+{
+ u32 seq;
+ int last = RCV_PKT_OK;
+ struct hfi1_packet packet;
+
+ init_packet(rcd, &packet);
+ seq = rhf_rcv_seq(packet.rhf);
+ if (seq != rcd->seq_cnt) {
+ last = RCV_PKT_DONE;
+ goto bail;
+ }
+
+ prescan_rxq(rcd, &packet);
+
+ while (last == RCV_PKT_OK) {
+ last = process_rcv_packet(&packet, thread);
+ seq = rhf_rcv_seq(packet.rhf);
+ if (++rcd->seq_cnt > 13)
+ rcd->seq_cnt = 1;
+ if (seq != rcd->seq_cnt)
+ last = RCV_PKT_DONE;
+ process_rcv_update(last, &packet);
+ }
+ process_rcv_qp_work(&packet);
+bail:
+ finish_packet(&packet);
+ return last;
+}
+
+int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread)
+{
+ u32 hdrqtail;
+ int last = RCV_PKT_OK;
+ struct hfi1_packet packet;
+
+ init_packet(rcd, &packet);
+ hdrqtail = get_rcvhdrtail(rcd);
+ if (packet.rhqoff == hdrqtail) {
+ last = RCV_PKT_DONE;
+ goto bail;
+ }
+ smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
+
+ prescan_rxq(rcd, &packet);
+
+ while (last == RCV_PKT_OK) {
+ last = process_rcv_packet(&packet, thread);
+ if (packet.rhqoff == hdrqtail)
+ last = RCV_PKT_DONE;
+ process_rcv_update(last, &packet);
+ }
+ process_rcv_qp_work(&packet);
+bail:
+ finish_packet(&packet);
+ return last;
+}
+
+static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+ dd->rcd[i]->do_interrupt =
+ &handle_receive_interrupt_nodma_rtail;
+}
+
+static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+ dd->rcd[i]->do_interrupt =
+ &handle_receive_interrupt_dma_rtail;
+}
+
+void set_all_slowpath(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */
+ for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++)
+ dd->rcd[i]->do_interrupt = &handle_receive_interrupt;
+}
+
+static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet,
+ struct hfi1_devdata *dd)
+{
+ struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
+ struct hfi1_message_header *hdr = hfi1_get_msgheader(packet->rcd->dd,
+ packet->rhf_addr);
+ u8 etype = rhf_rcv_type(packet->rhf);
+
+ if (etype == RHF_RCV_TYPE_IB && hdr2sc(hdr, packet->rhf) != 0xf) {
+ int hwstate = read_logical_state(dd);
+
+ if (hwstate != LSTATE_ACTIVE) {
+ dd_dev_info(dd, "Unexpected link state %d\n", hwstate);
+ return 0;
+ }
+
+ queue_work(rcd->ppd->hfi1_wq, lsaw);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * handle_receive_interrupt - receive a packet
+ * @rcd: the context
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler.
+ */
+int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 hdrqtail;
+ int needset, last = RCV_PKT_OK;
+ struct hfi1_packet packet;
+ int skip_pkt = 0;
+
+ /* Control context will always use the slow path interrupt handler */
+ needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
+
+ init_packet(rcd, &packet);
+
+ if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+ u32 seq = rhf_rcv_seq(packet.rhf);
+
+ if (seq != rcd->seq_cnt) {
+ last = RCV_PKT_DONE;
+ goto bail;
+ }
+ hdrqtail = 0;
+ } else {
+ hdrqtail = get_rcvhdrtail(rcd);
+ if (packet.rhqoff == hdrqtail) {
+ last = RCV_PKT_DONE;
+ goto bail;
+ }
+ smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
+
+ /*
+ * Control context can potentially receive an invalid
+ * rhf. Drop such packets.
+ */
+ if (rcd->ctxt == HFI1_CTRL_CTXT) {
+ u32 seq = rhf_rcv_seq(packet.rhf);
+
+ if (seq != rcd->seq_cnt)
+ skip_pkt = 1;
+ }
+ }
+
+ prescan_rxq(rcd, &packet);
+
+ while (last == RCV_PKT_OK) {
+ if (unlikely(dd->do_drop &&
+ atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
+ DROP_PACKET_ON)) {
+ dd->do_drop = 0;
+
+ /* On to the next packet */
+ packet.rhqoff += packet.rsize;
+ packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
+ packet.rhqoff +
+ dd->rhf_offset;
+ packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+ } else if (skip_pkt) {
+ last = skip_rcv_packet(&packet, thread);
+ skip_pkt = 0;
+ } else {
+ /* Auto activate link on non-SC15 packet receive */
+ if (unlikely(rcd->ppd->host_link_state ==
+ HLS_UP_ARMED) &&
+ set_armed_to_active(rcd, &packet, dd))
+ goto bail;
+ last = process_rcv_packet(&packet, thread);
+ }
+
+ if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+ u32 seq = rhf_rcv_seq(packet.rhf);
+
+ if (++rcd->seq_cnt > 13)
+ rcd->seq_cnt = 1;
+ if (seq != rcd->seq_cnt)
+ last = RCV_PKT_DONE;
+ if (needset) {
+ dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
+ set_all_nodma_rtail(dd);
+ needset = 0;
+ }
+ } else {
+ if (packet.rhqoff == hdrqtail)
+ last = RCV_PKT_DONE;
+ /*
+ * Control context can potentially receive an invalid
+ * rhf. Drop such packets.
+ */
+ if (rcd->ctxt == HFI1_CTRL_CTXT) {
+ u32 seq = rhf_rcv_seq(packet.rhf);
+
+ if (++rcd->seq_cnt > 13)
+ rcd->seq_cnt = 1;
+ if (!last && (seq != rcd->seq_cnt))
+ skip_pkt = 1;
+ }
+
+ if (needset) {
+ dd_dev_info(dd,
+ "Switching to DMA_RTAIL\n");
+ set_all_dma_rtail(dd);
+ needset = 0;
+ }
+ }
+
+ process_rcv_update(last, &packet);
+ }
+
+ process_rcv_qp_work(&packet);
+
+bail:
+ /*
+ * Always write head at end, and setup rcv interrupt, even
+ * if no packets were processed.
+ */
+ finish_packet(&packet);
+ return last;
+}
+
+/*
+ * We may discover in the interrupt that the hardware link state has
+ * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
+ * and we need to update the driver's notion of the link state. We cannot
+ * run set_link_state from interrupt context, so we queue this function on
+ * a workqueue.
+ *
+ * We delay the regular interrupt processing until after the state changes
+ * so that the link will be in the correct state by the time any application
+ * we wake up attempts to send a reply to any message it received.
+ * (Subsequent receive interrupts may possibly force the wakeup before we
+ * update the link state.)
+ *
+ * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes
+ * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues,
+ * so we're safe from use-after-free of the rcd.
+ */
+void receive_interrupt_work(struct work_struct *work)
+{
+ struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+ linkstate_active_work);
+ struct hfi1_devdata *dd = ppd->dd;
+ int i;
+
+ /* Received non-SC15 packet implies neighbor_normal */
+ ppd->neighbor_normal = 1;
+ set_link_state(ppd, HLS_UP_ACTIVE);
+
+ /*
+ * Interrupt all kernel contexts that could have had an
+ * interrupt during auto activation.
+ */
+ for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++)
+ force_recv_intr(dd->rcd[i]);
+}
+
+/*
+ * Convert a given MTU size to the on-wire MAD packet enumeration.
+ * Return -1 if the size is invalid.
+ */
+int mtu_to_enum(u32 mtu, int default_if_bad)
+{
+ switch (mtu) {
+ case 0: return OPA_MTU_0;
+ case 256: return OPA_MTU_256;
+ case 512: return OPA_MTU_512;
+ case 1024: return OPA_MTU_1024;
+ case 2048: return OPA_MTU_2048;
+ case 4096: return OPA_MTU_4096;
+ case 8192: return OPA_MTU_8192;
+ case 10240: return OPA_MTU_10240;
+ }
+ return default_if_bad;
+}
+
+u16 enum_to_mtu(int mtu)
+{
+ switch (mtu) {
+ case OPA_MTU_0: return 0;
+ case OPA_MTU_256: return 256;
+ case OPA_MTU_512: return 512;
+ case OPA_MTU_1024: return 1024;
+ case OPA_MTU_2048: return 2048;
+ case OPA_MTU_4096: return 4096;
+ case OPA_MTU_8192: return 8192;
+ case OPA_MTU_10240: return 10240;
+ default: return 0xffff;
+ }
+}
+
+/*
+ * set_mtu - set the MTU
+ * @ppd: the per port data
+ *
+ * We can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size. We do not deal with what happens
+ * to programs that are already running when the size changes.
+ */
+int set_mtu(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ int i, drain, ret = 0, is_up = 0;
+
+ ppd->ibmtu = 0;
+ for (i = 0; i < ppd->vls_supported; i++)
+ if (ppd->ibmtu < dd->vld[i].mtu)
+ ppd->ibmtu = dd->vld[i].mtu;
+ ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
+
+ mutex_lock(&ppd->hls_lock);
+ if (ppd->host_link_state == HLS_UP_INIT ||
+ ppd->host_link_state == HLS_UP_ARMED ||
+ ppd->host_link_state == HLS_UP_ACTIVE)
+ is_up = 1;
+
+ drain = !is_ax(dd) && is_up;
+
+ if (drain)
+ /*
+ * MTU is specified per-VL. To ensure that no packet gets
+ * stuck (due, e.g., to the MTU for the packet's VL being
+ * reduced), empty the per-VL FIFOs before adjusting MTU.
+ */
+ ret = stop_drain_data_vls(dd);
+
+ if (ret) {
+ dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
+ __func__);
+ goto err;
+ }
+
+ hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
+
+ if (drain)
+ open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+ mutex_unlock(&ppd->hls_lock);
+
+ return ret;
+}
+
+int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ ppd->lid = lid;
+ ppd->lmc = lmc;
+ hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
+
+ dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid);
+
+ return 0;
+}
+
+void shutdown_led_override(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /*
+ * This pairs with the memory barrier in hfi1_start_led_override to
+ * ensure that we read the correct state of LED beaconing represented
+ * by led_override_timer_active
+ */
+ smp_rmb();
+ if (atomic_read(&ppd->led_override_timer_active)) {
+ del_timer_sync(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 0);
+ /* Ensure the atomic_set is visible to all CPUs */
+ smp_wmb();
+ }
+
+ /* Hand control of the LED to the DC for normal operation */
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+}
+
+static void run_led_override(unsigned long opaque)
+{
+ struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
+ struct hfi1_devdata *dd = ppd->dd;
+ unsigned long timeout;
+ int phase_idx;
+
+ if (!(dd->flags & HFI1_INITTED))
+ return;
+
+ phase_idx = ppd->led_override_phase & 1;
+
+ setextled(dd, phase_idx);
+
+ timeout = ppd->led_override_vals[phase_idx];
+
+ /* Set up for next phase */
+ ppd->led_override_phase = !ppd->led_override_phase;
+
+ mod_timer(&ppd->led_override_timer, jiffies + timeout);
+}
+
+/*
+ * To have the LED blink in a particular pattern, provide timeon and timeoff
+ * in milliseconds.
+ * To turn off custom blinking and return to normal operation, use
+ * shutdown_led_override()
+ */
+void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+ unsigned int timeoff)
+{
+ if (!(ppd->dd->flags & HFI1_INITTED))
+ return;
+
+ /* Convert to jiffies for direct use in timer */
+ ppd->led_override_vals[0] = msecs_to_jiffies(timeoff);
+ ppd->led_override_vals[1] = msecs_to_jiffies(timeon);
+
+ /* Arbitrarily start from LED on phase */
+ ppd->led_override_phase = 1;
+
+ /*
+ * If the timer has not already been started, do so. Use a "quick"
+ * timeout so the handler will be called soon to look at our request.
+ */
+ if (!timer_pending(&ppd->led_override_timer)) {
+ setup_timer(&ppd->led_override_timer, run_led_override,
+ (unsigned long)ppd);
+ ppd->led_override_timer.expires = jiffies + 1;
+ add_timer(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 1);
+ /* Ensure the atomic_set is visible to all CPUs */
+ smp_wmb();
+ }
+}
+
+/**
+ * hfi1_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload). We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize. For
+ * now, we only allow this if no user contexts are open that use chip resources
+ */
+int hfi1_reset_device(int unit)
+{
+ int ret, i;
+ struct hfi1_devdata *dd = hfi1_lookup(unit);
+ struct hfi1_pportdata *ppd;
+ unsigned long flags;
+ int pidx;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ dd_dev_info(dd, "Reset on unit %u requested\n", unit);
+
+ if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+ dd_dev_info(dd,
+ "Invalid unit number %u or not initialized or not present\n",
+ unit);
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ if (dd->rcd)
+ for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+ if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+ continue;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ ret = -EBUSY;
+ goto bail;
+ }
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ shutdown_led_override(ppd);
+ }
+ if (dd->flags & HFI1_HAS_SEND_DMA)
+ sdma_exit(dd);
+
+ hfi1_reset_cpu_counters(dd);
+
+ ret = hfi1_init(dd, 1);
+
+ if (ret)
+ dd_dev_err(dd,
+ "Reinitialize unit %u after reset failed with %d\n",
+ unit, ret);
+ else
+ dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
+ unit);
+
+bail:
+ return ret;
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ u32 rte = rhf_rcv_type_err(packet->rhf);
+
+ rcv_hdrerr(rcd, rcd->ppd, packet);
+ if (rhf_err_flags(packet->rhf))
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+}
+
+/*
+ * The following functions are called by the interrupt handler. They are type
+ * specific handlers for each packet type.
+ */
+int process_receive_ib(struct hfi1_packet *packet)
+{
+ trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
+ packet->rcd->ctxt,
+ rhf_err_flags(packet->rhf),
+ RHF_RCV_TYPE_IB,
+ packet->hlen,
+ packet->tlen,
+ packet->updegr,
+ rhf_egr_index(packet->rhf));
+
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ handle_eflags(packet);
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_ib_rcv(packet);
+ return RHF_RCV_CONTINUE;
+}
+
+int process_receive_bypass(struct hfi1_packet *packet)
+{
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ dd_dev_err(packet->rcd->dd,
+ "Bypass packets are not supported in normal operation. Dropping\n");
+ incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors);
+ return RHF_RCV_CONTINUE;
+}
+
+int process_receive_error(struct hfi1_packet *packet)
+{
+ handle_eflags(packet);
+
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ dd_dev_err(packet->rcd->dd,
+ "Unhandled error packet received. Dropping.\n");
+
+ return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_expected(struct hfi1_packet *packet)
+{
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ dd_dev_err(packet->rcd->dd,
+ "Unhandled expected packet received. Dropping.\n");
+ return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_eager(struct hfi1_packet *packet)
+{
+ if (unlikely(rhf_err_flags(packet->rhf)))
+ handle_eflags(packet);
+
+ dd_dev_err(packet->rcd->dd,
+ "Unhandled eager packet received. Dropping.\n");
+ return RHF_RCV_CONTINUE;
+}
+
+int process_receive_invalid(struct hfi1_packet *packet)
+{
+ dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
+ rhf_rcv_type(packet->rhf));
+ return RHF_RCV_CONTINUE;
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c
new file mode 100644
index 000000000000..106349fc1fb9
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/efivar.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "efivar.h"
+
+/* GUID for HFI1 variables in EFI */
+#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \
+ 0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4)
+/* largest EFI data size we expect */
+#define EFI_DATA_SIZE 4096
+
+/*
+ * Read the named EFI variable. Return the size of the actual data in *size
+ * and a kmalloc'ed buffer in *return_data. The caller must free the
+ * data. It is guaranteed that *return_data will be NULL and *size = 0
+ * if this routine fails.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+static int read_efi_var(const char *name, unsigned long *size,
+ void **return_data)
+{
+ efi_status_t status;
+ efi_char16_t *uni_name;
+ efi_guid_t guid;
+ unsigned long temp_size;
+ void *temp_buffer;
+ void *data;
+ int i;
+ int ret;
+
+ /* set failure return values */
+ *size = 0;
+ *return_data = NULL;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return -EOPNOTSUPP;
+
+ uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
+ temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL);
+
+ if (!uni_name || !temp_buffer) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ /* input: the size of the buffer */
+ temp_size = EFI_DATA_SIZE;
+
+ /* convert ASCII to unicode - it is a 1:1 mapping */
+ for (i = 0; name[i]; i++)
+ uni_name[i] = name[i];
+
+ /* need a variable for our GUID */
+ guid = HFI1_EFIVAR_GUID;
+
+ /* call into EFI runtime services */
+ status = efi.get_variable(
+ uni_name,
+ &guid,
+ NULL,
+ &temp_size,
+ temp_buffer);
+
+ /*
+ * It would be nice to call efi_status_to_err() here, but that
+ * is in the EFIVAR_FS code and may not be compiled in.
+ * However, even that is insufficient since it does not cover
+ * EFI_BUFFER_TOO_SMALL which could be an important return.
+ * For now, just split out succces or not found.
+ */
+ ret = status == EFI_SUCCESS ? 0 :
+ status == EFI_NOT_FOUND ? -ENOENT :
+ -EINVAL;
+ if (ret)
+ goto fail;
+
+ /*
+ * We have successfully read the EFI variable into our
+ * temporary buffer. Now allocate a correctly sized
+ * buffer.
+ */
+ data = kmemdup(temp_buffer, temp_size, GFP_KERNEL);
+ if (!data) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ *size = temp_size;
+ *return_data = data;
+
+fail:
+ kfree(uni_name);
+ kfree(temp_buffer);
+
+ return ret;
+}
+
+/*
+ * Read an HFI1 EFI variable of the form:
+ * <PCIe address>-<kind>
+ * Return an kalloc'ed array and size of the data.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+ unsigned long *size, void **return_data)
+{
+ char name[64];
+
+ /* create a common prefix */
+ snprintf(name, sizeof(name), "%04x:%02x:%02x.%x-%s",
+ pci_domain_nr(dd->pcidev->bus),
+ dd->pcidev->bus->number,
+ PCI_SLOT(dd->pcidev->devfn),
+ PCI_FUNC(dd->pcidev->devfn),
+ kind);
+
+ return read_efi_var(name, size, return_data);
+}
diff --git a/drivers/infiniband/hw/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h
new file mode 100644
index 000000000000..94e9e70de568
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/efivar.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_EFIVAR_H
+#define _HFI1_EFIVAR_H
+
+#include <linux/efi.h>
+
+#include "hfi.h"
+
+int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind,
+ unsigned long *size, void **return_data);
+
+#endif /* _HFI1_EFIVAR_H */
diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c
new file mode 100644
index 000000000000..36b77943cbfd
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/eprom.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
+
+#define CMD_SHIFT 24
+#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT))
+
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2 /* full speed */
+
+/*
+ * How long to wait for the EPROM to become available, in ms.
+ * The spec 32 Mb EPROM takes around 40s to erase then write.
+ * Double it for safety.
+ */
+#define EPROM_TIMEOUT 80000 /* ms */
+/*
+ * Initialize the EPROM handler.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+
+ /* only the discrete chip has an EPROM */
+ if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+ return 0;
+
+ /*
+ * It is OK if both HFIs reset the EPROM as long as they don't
+ * do it at the same time.
+ */
+ ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: unable to acquire EPROM resource, no EPROM support\n",
+ __func__);
+ goto done_asic;
+ }
+
+ /* reset EPROM to be sure it is in a good state */
+
+ /* set reset */
+ write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+ /* clear reset, set speed */
+ write_csr(dd, ASIC_EEP_CTL_STAT,
+ EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+ /* wake the device with command "release powerdown NoID" */
+ write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+ dd->eprom_available = true;
+ release_chip_resource(dd, CR_EPROM);
+done_asic:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h
new file mode 100644
index 000000000000..d41f0b1afb15
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/eprom.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_cmd;
+struct hfi1_devdata;
+
+int eprom_init(struct hfi1_devdata *dd);
+int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd);
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
new file mode 100644
index 000000000000..7e03ccd2554d
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -0,0 +1,1514 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/vmalloc.h>
+#include <linux/io.h>
+
+#include <rdma/ib.h>
+
+#include "hfi.h"
+#include "pio.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "user_sdma.h"
+#include "user_exp_rcv.h"
+#include "eprom.h"
+#include "aspm.h"
+#include "mmu_rb.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
+
+/*
+ * File operation functions
+ */
+static int hfi1_file_open(struct inode *, struct file *);
+static int hfi1_file_close(struct inode *, struct file *);
+static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
+static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
+
+static u64 kvirt_to_phys(void *);
+static int assign_ctxt(struct file *, struct hfi1_user_info *);
+static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
+static int user_init(struct file *);
+static int get_ctxt_info(struct file *, void __user *, __u32);
+static int get_base_info(struct file *, void __user *, __u32);
+static int setup_ctxt(struct file *);
+static int setup_subctxt(struct hfi1_ctxtdata *);
+static int get_user_context(struct file *, struct hfi1_user_info *, int);
+static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
+static int allocate_ctxt(struct file *, struct hfi1_devdata *,
+ struct hfi1_user_info *);
+static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
+static unsigned int poll_next(struct file *, struct poll_table_struct *);
+static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
+static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
+static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
+static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+ unsigned long arg);
+
+static const struct file_operations hfi1_file_ops = {
+ .owner = THIS_MODULE,
+ .write_iter = hfi1_write_iter,
+ .open = hfi1_file_open,
+ .release = hfi1_file_close,
+ .unlocked_ioctl = hfi1_file_ioctl,
+ .poll = hfi1_poll,
+ .mmap = hfi1_file_mmap,
+ .llseek = noop_llseek,
+};
+
+static struct vm_operations_struct vm_ops = {
+ .fault = vma_fault,
+};
+
+/*
+ * Types of memories mapped into user processes' space
+ */
+enum mmap_types {
+ PIO_BUFS = 1,
+ PIO_BUFS_SOP,
+ PIO_CRED,
+ RCV_HDRQ,
+ RCV_EGRBUF,
+ UREGS,
+ EVENTS,
+ STATUS,
+ RTAIL,
+ SUBCTXT_UREGS,
+ SUBCTXT_RCV_HDRQ,
+ SUBCTXT_EGRBUF,
+ SDMA_COMP
+};
+
+/*
+ * Masks and offsets defining the mmap tokens
+ */
+#define HFI1_MMAP_OFFSET_MASK 0xfffULL
+#define HFI1_MMAP_OFFSET_SHIFT 0
+#define HFI1_MMAP_SUBCTXT_MASK 0xfULL
+#define HFI1_MMAP_SUBCTXT_SHIFT 12
+#define HFI1_MMAP_CTXT_MASK 0xffULL
+#define HFI1_MMAP_CTXT_SHIFT 16
+#define HFI1_MMAP_TYPE_MASK 0xfULL
+#define HFI1_MMAP_TYPE_SHIFT 24
+#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL
+#define HFI1_MMAP_MAGIC_SHIFT 32
+
+#define HFI1_MMAP_MAGIC 0xdabbad00
+
+#define HFI1_MMAP_TOKEN_SET(field, val) \
+ (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
+#define HFI1_MMAP_TOKEN_GET(field, token) \
+ (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
+#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \
+ (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
+ HFI1_MMAP_TOKEN_SET(TYPE, type) | \
+ HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
+ HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
+ HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
+
+#define dbg(fmt, ...) \
+ pr_info(fmt, ##__VA_ARGS__)
+
+static inline int is_valid_mmap(u64 token)
+{
+ return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
+}
+
+static int hfi1_file_open(struct inode *inode, struct file *fp)
+{
+ struct hfi1_filedata *fd;
+ struct hfi1_devdata *dd = container_of(inode->i_cdev,
+ struct hfi1_devdata,
+ user_cdev);
+
+ /* Just take a ref now. Not all opens result in a context assign */
+ kobject_get(&dd->kobj);
+
+ /* The real work is performed later in assign_ctxt() */
+
+ fd = kzalloc(sizeof(*fd), GFP_KERNEL);
+
+ if (fd) {
+ fd->rec_cpu_num = -1; /* no cpu affinity by default */
+ fd->mm = current->mm;
+ atomic_inc(&fd->mm->mm_count);
+ }
+
+ fp->private_data = fd;
+
+ return fd ? 0 : -ENOMEM;
+}
+
+static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_user_info uinfo;
+ struct hfi1_tid_info tinfo;
+ int ret = 0;
+ unsigned long addr;
+ int uval = 0;
+ unsigned long ul_uval = 0;
+ u16 uval16 = 0;
+
+ hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
+ if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
+ cmd != HFI1_IOCTL_GET_VERS &&
+ !uctxt)
+ return -EINVAL;
+
+ switch (cmd) {
+ case HFI1_IOCTL_ASSIGN_CTXT:
+ if (uctxt)
+ return -EINVAL;
+
+ if (copy_from_user(&uinfo,
+ (struct hfi1_user_info __user *)arg,
+ sizeof(uinfo)))
+ return -EFAULT;
+
+ ret = assign_ctxt(fp, &uinfo);
+ if (ret < 0)
+ return ret;
+ ret = setup_ctxt(fp);
+ if (ret)
+ return ret;
+ ret = user_init(fp);
+ break;
+ case HFI1_IOCTL_CTXT_INFO:
+ ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg,
+ sizeof(struct hfi1_ctxt_info));
+ break;
+ case HFI1_IOCTL_USER_INFO:
+ ret = get_base_info(fp, (void __user *)(unsigned long)arg,
+ sizeof(struct hfi1_base_info));
+ break;
+ case HFI1_IOCTL_CREDIT_UPD:
+ if (uctxt)
+ sc_return_credits(uctxt->sc);
+ break;
+
+ case HFI1_IOCTL_TID_UPDATE:
+ if (copy_from_user(&tinfo,
+ (struct hfi11_tid_info __user *)arg,
+ sizeof(tinfo)))
+ return -EFAULT;
+
+ ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
+ if (!ret) {
+ /*
+ * Copy the number of tidlist entries we used
+ * and the length of the buffer we registered.
+ * These fields are adjacent in the structure so
+ * we can copy them at the same time.
+ */
+ addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+ if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+ sizeof(tinfo.tidcnt) +
+ sizeof(tinfo.length)))
+ ret = -EFAULT;
+ }
+ break;
+
+ case HFI1_IOCTL_TID_FREE:
+ if (copy_from_user(&tinfo,
+ (struct hfi11_tid_info __user *)arg,
+ sizeof(tinfo)))
+ return -EFAULT;
+
+ ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
+ if (ret)
+ break;
+ addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+ if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+ sizeof(tinfo.tidcnt)))
+ ret = -EFAULT;
+ break;
+
+ case HFI1_IOCTL_TID_INVAL_READ:
+ if (copy_from_user(&tinfo,
+ (struct hfi11_tid_info __user *)arg,
+ sizeof(tinfo)))
+ return -EFAULT;
+
+ ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+ if (ret)
+ break;
+ addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
+ if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+ sizeof(tinfo.tidcnt)))
+ ret = -EFAULT;
+ break;
+
+ case HFI1_IOCTL_RECV_CTRL:
+ ret = get_user(uval, (int __user *)arg);
+ if (ret != 0)
+ return -EFAULT;
+ ret = manage_rcvq(uctxt, fd->subctxt, uval);
+ break;
+
+ case HFI1_IOCTL_POLL_TYPE:
+ ret = get_user(uval, (int __user *)arg);
+ if (ret != 0)
+ return -EFAULT;
+ uctxt->poll_type = (typeof(uctxt->poll_type))uval;
+ break;
+
+ case HFI1_IOCTL_ACK_EVENT:
+ ret = get_user(ul_uval, (unsigned long __user *)arg);
+ if (ret != 0)
+ return -EFAULT;
+ ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
+ break;
+
+ case HFI1_IOCTL_SET_PKEY:
+ ret = get_user(uval16, (u16 __user *)arg);
+ if (ret != 0)
+ return -EFAULT;
+ if (HFI1_CAP_IS_USET(PKEY_CHECK))
+ ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
+ else
+ return -EPERM;
+ break;
+
+ case HFI1_IOCTL_CTXT_RESET: {
+ struct send_context *sc;
+ struct hfi1_devdata *dd;
+
+ if (!uctxt || !uctxt->dd || !uctxt->sc)
+ return -EINVAL;
+
+ /*
+ * There is no protection here. User level has to
+ * guarantee that no one will be writing to the send
+ * context while it is being re-initialized.
+ * If user level breaks that guarantee, it will break
+ * it's own context and no one else's.
+ */
+ dd = uctxt->dd;
+ sc = uctxt->sc;
+ /*
+ * Wait until the interrupt handler has marked the
+ * context as halted or frozen. Report error if we time
+ * out.
+ */
+ wait_event_interruptible_timeout(
+ sc->halt_wait, (sc->flags & SCF_HALTED),
+ msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+ if (!(sc->flags & SCF_HALTED))
+ return -ENOLCK;
+
+ /*
+ * If the send context was halted due to a Freeze,
+ * wait until the device has been "unfrozen" before
+ * resetting the context.
+ */
+ if (sc->flags & SCF_FROZEN) {
+ wait_event_interruptible_timeout(
+ dd->event_queue,
+ !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+ msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+ if (dd->flags & HFI1_FROZEN)
+ return -ENOLCK;
+
+ if (dd->flags & HFI1_FORCED_FREEZE)
+ /*
+ * Don't allow context reset if we are into
+ * forced freeze
+ */
+ return -ENODEV;
+
+ sc_disable(sc);
+ ret = sc_enable(sc);
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
+ uctxt->ctxt);
+ } else {
+ ret = sc_restart(sc);
+ }
+ if (!ret)
+ sc_return_credits(sc);
+ break;
+ }
+
+ case HFI1_IOCTL_GET_VERS:
+ uval = HFI1_USER_SWVERSION;
+ if (put_user(uval, (int __user *)arg))
+ return -EFAULT;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+ struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
+ struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+ struct hfi1_user_sdma_comp_q *cq = fd->cq;
+ int done = 0, reqs = 0;
+ unsigned long dim = from->nr_segs;
+
+ if (!cq || !pq)
+ return -EIO;
+
+ if (!iter_is_iovec(from) || !dim)
+ return -EINVAL;
+
+ hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
+ fd->uctxt->ctxt, fd->subctxt, dim);
+
+ if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
+ return -ENOSPC;
+
+ while (dim) {
+ int ret;
+ unsigned long count = 0;
+
+ ret = hfi1_user_sdma_process_request(
+ kiocb->ki_filp, (struct iovec *)(from->iov + done),
+ dim, &count);
+ if (ret) {
+ reqs = ret;
+ break;
+ }
+ dim -= count;
+ done += count;
+ reqs++;
+ }
+
+ return reqs;
+}
+
+static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd;
+ unsigned long flags, pfn;
+ u64 token = vma->vm_pgoff << PAGE_SHIFT,
+ memaddr = 0;
+ u8 subctxt, mapio = 0, vmf = 0, type;
+ ssize_t memlen = 0;
+ int ret = 0;
+ u16 ctxt;
+
+ if (!is_valid_mmap(token) || !uctxt ||
+ !(vma->vm_flags & VM_SHARED)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ dd = uctxt->dd;
+ ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
+ subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
+ type = HFI1_MMAP_TOKEN_GET(TYPE, token);
+ if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ flags = vma->vm_flags;
+
+ switch (type) {
+ case PIO_BUFS:
+ case PIO_BUFS_SOP:
+ memaddr = ((dd->physaddr + TXE_PIO_SEND) +
+ /* chip pio base */
+ (uctxt->sc->hw_context * BIT(16))) +
+ /* 64K PIO space / ctxt */
+ (type == PIO_BUFS_SOP ?
+ (TXE_PIO_SIZE / 2) : 0); /* sop? */
+ /*
+ * Map only the amount allocated to the context, not the
+ * entire available context's PIO space.
+ */
+ memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
+ flags &= ~VM_MAYREAD;
+ flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ mapio = 1;
+ break;
+ case PIO_CRED:
+ if (flags & VM_WRITE) {
+ ret = -EPERM;
+ goto done;
+ }
+ /*
+ * The credit return location for this context could be on the
+ * second or third page allocated for credit returns (if number
+ * of enabled contexts > 64 and 128 respectively).
+ */
+ memaddr = dd->cr_base[uctxt->numa_id].pa +
+ (((u64)uctxt->sc->hw_free -
+ (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+ memlen = PAGE_SIZE;
+ flags &= ~VM_MAYWRITE;
+ flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ /*
+ * The driver has already allocated memory for credit
+ * returns and programmed it into the chip. Has that
+ * memory been flagged as non-cached?
+ */
+ /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
+ mapio = 1;
+ break;
+ case RCV_HDRQ:
+ memaddr = uctxt->rcvhdrq_phys;
+ memlen = uctxt->rcvhdrq_size;
+ break;
+ case RCV_EGRBUF: {
+ unsigned long addr;
+ int i;
+ /*
+ * The RcvEgr buffer need to be handled differently
+ * as multiple non-contiguous pages need to be mapped
+ * into the user process.
+ */
+ memlen = uctxt->egrbufs.size;
+ if ((vma->vm_end - vma->vm_start) != memlen) {
+ dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
+ (vma->vm_end - vma->vm_start), memlen);
+ ret = -EINVAL;
+ goto done;
+ }
+ if (vma->vm_flags & VM_WRITE) {
+ ret = -EPERM;
+ goto done;
+ }
+ vma->vm_flags &= ~VM_MAYWRITE;
+ addr = vma->vm_start;
+ for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
+ ret = remap_pfn_range(
+ vma, addr,
+ uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
+ uctxt->egrbufs.buffers[i].len,
+ vma->vm_page_prot);
+ if (ret < 0)
+ goto done;
+ addr += uctxt->egrbufs.buffers[i].len;
+ }
+ ret = 0;
+ goto done;
+ }
+ case UREGS:
+ /*
+ * Map only the page that contains this context's user
+ * registers.
+ */
+ memaddr = (unsigned long)
+ (dd->physaddr + RXE_PER_CONTEXT_USER)
+ + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
+ /*
+ * TidFlow table is on the same page as the rest of the
+ * user registers.
+ */
+ memlen = PAGE_SIZE;
+ flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ mapio = 1;
+ break;
+ case EVENTS:
+ /*
+ * Use the page where this context's flags are. User level
+ * knows where it's own bitmap is within the page.
+ */
+ memaddr = (unsigned long)(dd->events +
+ ((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+ memlen = PAGE_SIZE;
+ /*
+ * v3.7 removes VM_RESERVED but the effect is kept by
+ * using VM_IO.
+ */
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ case STATUS:
+ memaddr = kvirt_to_phys((void *)dd->status);
+ memlen = PAGE_SIZE;
+ flags |= VM_IO | VM_DONTEXPAND;
+ break;
+ case RTAIL:
+ if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
+ /*
+ * If the memory allocation failed, the context alloc
+ * also would have failed, so we would never get here
+ */
+ ret = -EINVAL;
+ goto done;
+ }
+ if (flags & VM_WRITE) {
+ ret = -EPERM;
+ goto done;
+ }
+ memaddr = uctxt->rcvhdrqtailaddr_phys;
+ memlen = PAGE_SIZE;
+ flags &= ~VM_MAYWRITE;
+ break;
+ case SUBCTXT_UREGS:
+ memaddr = (u64)uctxt->subctxt_uregbase;
+ memlen = PAGE_SIZE;
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ case SUBCTXT_RCV_HDRQ:
+ memaddr = (u64)uctxt->subctxt_rcvhdr_base;
+ memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ case SUBCTXT_EGRBUF:
+ memaddr = (u64)uctxt->subctxt_rcvegrbuf;
+ memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
+ flags |= VM_IO | VM_DONTEXPAND;
+ flags &= ~VM_MAYWRITE;
+ vmf = 1;
+ break;
+ case SDMA_COMP: {
+ struct hfi1_user_sdma_comp_q *cq = fd->cq;
+
+ if (!cq) {
+ ret = -EFAULT;
+ goto done;
+ }
+ memaddr = (u64)cq->comps;
+ memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
+ flags |= VM_IO | VM_DONTEXPAND;
+ vmf = 1;
+ break;
+ }
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ if ((vma->vm_end - vma->vm_start) != memlen) {
+ hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
+ uctxt->ctxt, fd->subctxt,
+ (vma->vm_end - vma->vm_start), memlen);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ vma->vm_flags = flags;
+ hfi1_cdbg(PROC,
+ "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
+ ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
+ vma->vm_end - vma->vm_start, vma->vm_flags);
+ pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
+ if (vmf) {
+ vma->vm_pgoff = pfn;
+ vma->vm_ops = &vm_ops;
+ ret = 0;
+ } else if (mapio) {
+ ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+ vma->vm_page_prot);
+ } else {
+ ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+ vma->vm_page_prot);
+ }
+done:
+ return ret;
+}
+
+/*
+ * Local (non-chip) user memory is not mapped right away but as it is
+ * accessed by the user-level code.
+ */
+static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page;
+
+ page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
+{
+ struct hfi1_ctxtdata *uctxt;
+ unsigned pollflag;
+
+ uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
+ if (!uctxt)
+ pollflag = POLLERR;
+ else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
+ pollflag = poll_urgent(fp, pt);
+ else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
+ pollflag = poll_next(fp, pt);
+ else /* invalid */
+ pollflag = POLLERR;
+
+ return pollflag;
+}
+
+static int hfi1_file_close(struct inode *inode, struct file *fp)
+{
+ struct hfi1_filedata *fdata = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+ struct hfi1_devdata *dd = container_of(inode->i_cdev,
+ struct hfi1_devdata,
+ user_cdev);
+ unsigned long flags, *ev;
+
+ fp->private_data = NULL;
+
+ if (!uctxt)
+ goto done;
+
+ hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
+ mutex_lock(&hfi1_mutex);
+
+ flush_wc();
+ /* drain user sdma queue */
+ hfi1_user_sdma_free_queues(fdata);
+
+ /* release the cpu */
+ hfi1_put_proc_affinity(fdata->rec_cpu_num);
+
+ /*
+ * Clear any left over, unhandled events so the next process that
+ * gets this context doesn't get confused.
+ */
+ ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
+ *ev = 0;
+
+ if (--uctxt->cnt) {
+ uctxt->active_slaves &= ~(1 << fdata->subctxt);
+ mutex_unlock(&hfi1_mutex);
+ goto done;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ /*
+ * Disable receive context and interrupt available, reset all
+ * RcvCtxtCtrl bits to default values.
+ */
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+ HFI1_RCVCTRL_TIDFLOW_DIS |
+ HFI1_RCVCTRL_INTRAVAIL_DIS |
+ HFI1_RCVCTRL_TAILUPD_DIS |
+ HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+ HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+ HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+ /* Clear the context's J_KEY */
+ hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+ /*
+ * Reset context integrity checks to default.
+ * (writes to CSRs probably belong in chip.c)
+ */
+ write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+ hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
+ sc_disable(uctxt->sc);
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ dd->rcd[uctxt->ctxt] = NULL;
+
+ hfi1_user_exp_rcv_free(fdata);
+ hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+ uctxt->rcvwait_to = 0;
+ uctxt->piowait_to = 0;
+ uctxt->rcvnowait = 0;
+ uctxt->pionowait = 0;
+ uctxt->event_flags = 0;
+
+ hfi1_stats.sps_ctxts--;
+ if (++dd->freectxts == dd->num_user_contexts)
+ aspm_enable_all(dd);
+ mutex_unlock(&hfi1_mutex);
+ hfi1_free_ctxtdata(dd, uctxt);
+done:
+ mmdrop(fdata->mm);
+ kobject_put(&dd->kobj);
+ kfree(fdata);
+ return 0;
+}
+
+/*
+ * Convert kernel *virtual* addresses to physical addresses.
+ * This is used to vmalloc'ed addresses.
+ */
+static u64 kvirt_to_phys(void *addr)
+{
+ struct page *page;
+ u64 paddr = 0;
+
+ page = vmalloc_to_page(addr);
+ if (page)
+ paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
+{
+ int i_minor, ret = 0;
+ unsigned int swmajor, swminor;
+
+ swmajor = uinfo->userversion >> 16;
+ if (swmajor != HFI1_USER_SWMAJOR) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ swminor = uinfo->userversion & 0xffff;
+
+ mutex_lock(&hfi1_mutex);
+ /* First, lets check if we need to setup a shared context? */
+ if (uinfo->subctxt_cnt) {
+ struct hfi1_filedata *fd = fp->private_data;
+
+ ret = find_shared_ctxt(fp, uinfo);
+ if (ret < 0)
+ goto done_unlock;
+ if (ret) {
+ fd->rec_cpu_num =
+ hfi1_get_proc_affinity(fd->uctxt->numa_id);
+ }
+ }
+
+ /*
+ * We execute the following block if we couldn't find a
+ * shared context or if context sharing is not required.
+ */
+ if (!ret) {
+ i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
+ ret = get_user_context(fp, uinfo, i_minor);
+ }
+done_unlock:
+ mutex_unlock(&hfi1_mutex);
+done:
+ return ret;
+}
+
+static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
+ int devno)
+{
+ struct hfi1_devdata *dd = NULL;
+ int devmax, npresent, nup;
+
+ devmax = hfi1_count_units(&npresent, &nup);
+ if (!npresent)
+ return -ENXIO;
+
+ if (!nup)
+ return -ENETDOWN;
+
+ dd = hfi1_lookup(devno);
+ if (!dd)
+ return -ENODEV;
+ else if (!dd->freectxts)
+ return -EBUSY;
+
+ return allocate_ctxt(fp, dd, uinfo);
+}
+
+static int find_shared_ctxt(struct file *fp,
+ const struct hfi1_user_info *uinfo)
+{
+ int devmax, ndev, i;
+ int ret = 0;
+ struct hfi1_filedata *fd = fp->private_data;
+
+ devmax = hfi1_count_units(NULL, NULL);
+
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct hfi1_devdata *dd = hfi1_lookup(ndev);
+
+ if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
+ continue;
+ for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+ struct hfi1_ctxtdata *uctxt = dd->rcd[i];
+
+ /* Skip ctxts which are not yet open */
+ if (!uctxt || !uctxt->cnt)
+ continue;
+ /* Skip ctxt if it doesn't match the requested one */
+ if (memcmp(uctxt->uuid, uinfo->uuid,
+ sizeof(uctxt->uuid)) ||
+ uctxt->jkey != generate_jkey(current_uid()) ||
+ uctxt->subctxt_id != uinfo->subctxt_id ||
+ uctxt->subctxt_cnt != uinfo->subctxt_cnt)
+ continue;
+
+ /* Verify the sharing process matches the master */
+ if (uctxt->userversion != uinfo->userversion ||
+ uctxt->cnt >= uctxt->subctxt_cnt) {
+ ret = -EINVAL;
+ goto done;
+ }
+ fd->uctxt = uctxt;
+ fd->subctxt = uctxt->cnt++;
+ uctxt->active_slaves |= 1 << fd->subctxt;
+ ret = 1;
+ goto done;
+ }
+ }
+
+done:
+ return ret;
+}
+
+static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
+ struct hfi1_user_info *uinfo)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt;
+ unsigned ctxt;
+ int ret, numa;
+
+ if (dd->flags & HFI1_FROZEN) {
+ /*
+ * Pick an error that is unique from all other errors
+ * that are returned so the user process knows that
+ * it tried to allocate while the SPC was frozen. It
+ * it should be able to retry with success in a short
+ * while.
+ */
+ return -EIO;
+ }
+
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+ if (!dd->rcd[ctxt])
+ break;
+
+ if (ctxt == dd->num_rcv_contexts)
+ return -EBUSY;
+
+ /*
+ * If we don't have a NUMA node requested, preference is towards
+ * device NUMA node.
+ */
+ fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
+ if (fd->rec_cpu_num != -1)
+ numa = cpu_to_node(fd->rec_cpu_num);
+ else
+ numa = numa_node_id();
+ uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
+ if (!uctxt) {
+ dd_dev_err(dd,
+ "Unable to allocate ctxtdata memory, failing open\n");
+ return -ENOMEM;
+ }
+ hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
+ uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
+ uctxt->numa_id);
+
+ /*
+ * Allocate and enable a PIO send context.
+ */
+ uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
+ uctxt->dd->node);
+ if (!uctxt->sc)
+ return -ENOMEM;
+
+ hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
+ uctxt->sc->hw_context);
+ ret = sc_enable(uctxt->sc);
+ if (ret)
+ return ret;
+ /*
+ * Setup shared context resources if the user-level has requested
+ * shared contexts and this is the 'master' process.
+ * This has to be done here so the rest of the sub-contexts find the
+ * proper master.
+ */
+ if (uinfo->subctxt_cnt && !fd->subctxt) {
+ ret = init_subctxts(uctxt, uinfo);
+ /*
+ * On error, we don't need to disable and de-allocate the
+ * send context because it will be done during file close
+ */
+ if (ret)
+ return ret;
+ }
+ uctxt->userversion = uinfo->userversion;
+ uctxt->flags = hfi1_cap_mask; /* save current flag state */
+ init_waitqueue_head(&uctxt->wait);
+ strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
+ memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
+ uctxt->jkey = generate_jkey(current_uid());
+ INIT_LIST_HEAD(&uctxt->sdma_queues);
+ spin_lock_init(&uctxt->sdma_qlock);
+ hfi1_stats.sps_ctxts++;
+ /*
+ * Disable ASPM when there are open user/PSM contexts to avoid
+ * issues with ASPM L1 exit latency
+ */
+ if (dd->freectxts-- == dd->num_user_contexts)
+ aspm_disable_all(dd);
+ fd->uctxt = uctxt;
+
+ return 0;
+}
+
+static int init_subctxts(struct hfi1_ctxtdata *uctxt,
+ const struct hfi1_user_info *uinfo)
+{
+ unsigned num_subctxts;
+
+ num_subctxts = uinfo->subctxt_cnt;
+ if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
+ return -EINVAL;
+
+ uctxt->subctxt_cnt = uinfo->subctxt_cnt;
+ uctxt->subctxt_id = uinfo->subctxt_id;
+ uctxt->active_slaves = 1;
+ uctxt->redirect_seq_cnt = 1;
+ set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+
+ return 0;
+}
+
+static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
+{
+ int ret = 0;
+ unsigned num_subctxts = uctxt->subctxt_cnt;
+
+ uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
+ if (!uctxt->subctxt_uregbase) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ /* We can take the size of the RcvHdr Queue from the master */
+ uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
+ num_subctxts);
+ if (!uctxt->subctxt_rcvhdr_base) {
+ ret = -ENOMEM;
+ goto bail_ureg;
+ }
+
+ uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
+ num_subctxts);
+ if (!uctxt->subctxt_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail_rhdr;
+ }
+ goto bail;
+bail_rhdr:
+ vfree(uctxt->subctxt_rcvhdr_base);
+bail_ureg:
+ vfree(uctxt->subctxt_uregbase);
+ uctxt->subctxt_uregbase = NULL;
+bail:
+ return ret;
+}
+
+static int user_init(struct file *fp)
+{
+ unsigned int rcvctrl_ops = 0;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+ /* make sure that the context has already been setup */
+ if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
+ return -EFAULT;
+
+ /* initialize poll variables... */
+ uctxt->urgent = 0;
+ uctxt->urgent_poll = 0;
+
+ /*
+ * Now enable the ctxt for receive.
+ * For chips that are set to DMA the tail register to memory
+ * when they change (and when the update bit transitions from
+ * 0 to 1. So for those chips, we turn it off and then back on.
+ * This will (very briefly) affect any other open ctxts, but the
+ * duration is very short, and therefore isn't an issue. We
+ * explicitly set the in-memory tail copy to 0 beforehand, so we
+ * don't have to wait to be sure the DMA update has happened
+ * (chip resets head/tail to 0 on transition to enable).
+ */
+ if (uctxt->rcvhdrtail_kvaddr)
+ clear_rcvhdrtail(uctxt);
+
+ /* Setup J_KEY before enabling the context */
+ hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+
+ rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+ if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
+ rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
+ /*
+ * Ignore the bit in the flags for now until proper
+ * support for multiple packet per rcv array entry is
+ * added.
+ */
+ if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+ rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+ if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+ rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+ rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+ /*
+ * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
+ * We can't rely on the correct value to be set from prior
+ * uses of the chip or ctxt. Therefore, add the rcvctrl op
+ * for both cases.
+ */
+ if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
+ rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+ else
+ rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
+ hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+ /* Notify any waiting slaves */
+ if (uctxt->subctxt_cnt) {
+ clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+ wake_up(&uctxt->wait);
+ }
+
+ return 0;
+}
+
+static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
+{
+ struct hfi1_ctxt_info cinfo;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ int ret = 0;
+
+ memset(&cinfo, 0, sizeof(cinfo));
+ cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
+ HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
+ HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
+ HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
+ /* adjust flag if this fd is not able to cache */
+ if (!fd->handler)
+ cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
+
+ cinfo.num_active = hfi1_count_active_units();
+ cinfo.unit = uctxt->dd->unit;
+ cinfo.ctxt = uctxt->ctxt;
+ cinfo.subctxt = fd->subctxt;
+ cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
+ uctxt->dd->rcv_entries.group_size) +
+ uctxt->expected_count;
+ cinfo.credits = uctxt->sc->credits;
+ cinfo.numa_node = uctxt->numa_id;
+ cinfo.rec_cpu = fd->rec_cpu_num;
+ cinfo.send_ctxt = uctxt->sc->hw_context;
+
+ cinfo.egrtids = uctxt->egrbufs.alloced;
+ cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+ cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+ cinfo.sdma_ring_size = fd->cq->nentries;
+ cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
+
+ trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
+ if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int setup_ctxt(struct file *fp)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ int ret = 0;
+
+ /*
+ * Context should be set up only once, including allocation and
+ * programming of eager buffers. This is done if context sharing
+ * is not requested or by the master process.
+ */
+ if (!uctxt->subctxt_cnt || !fd->subctxt) {
+ ret = hfi1_init_ctxt(uctxt->sc);
+ if (ret)
+ goto done;
+
+ /* Now allocate the RcvHdr queue and eager buffers. */
+ ret = hfi1_create_rcvhdrq(dd, uctxt);
+ if (ret)
+ goto done;
+ ret = hfi1_setup_eagerbufs(uctxt);
+ if (ret)
+ goto done;
+ if (uctxt->subctxt_cnt && !fd->subctxt) {
+ ret = setup_subctxt(uctxt);
+ if (ret)
+ goto done;
+ }
+ } else {
+ ret = wait_event_interruptible(uctxt->wait, !test_bit(
+ HFI1_CTXT_MASTER_UNINIT,
+ &uctxt->event_flags));
+ if (ret)
+ goto done;
+ }
+
+ ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
+ if (ret)
+ goto done;
+ /*
+ * Expected receive has to be setup for all processes (including
+ * shared contexts). However, it has to be done after the master
+ * context has been fully configured as it depends on the
+ * eager/expected split of the RcvArray entries.
+ * Setting it up here ensures that the subcontexts will be waiting
+ * (due to the above wait_event_interruptible() until the master
+ * is setup.
+ */
+ ret = hfi1_user_exp_rcv_init(fp);
+ if (ret)
+ goto done;
+
+ set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+done:
+ return ret;
+}
+
+static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
+{
+ struct hfi1_base_info binfo;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ ssize_t sz;
+ unsigned offset;
+ int ret = 0;
+
+ trace_hfi1_uctxtdata(uctxt->dd, uctxt);
+
+ memset(&binfo, 0, sizeof(binfo));
+ binfo.hw_version = dd->revision;
+ binfo.sw_version = HFI1_KERN_SWVERSION;
+ binfo.bthqp = kdeth_qp;
+ binfo.jkey = uctxt->jkey;
+ /*
+ * If more than 64 contexts are enabled the allocated credit
+ * return will span two or three contiguous pages. Since we only
+ * map the page containing the context's credit return address,
+ * we need to calculate the offset in the proper page.
+ */
+ offset = ((u64)uctxt->sc->hw_free -
+ (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
+ binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
+ fd->subctxt, offset);
+ binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
+ fd->subctxt,
+ uctxt->sc->base_addr);
+ binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
+ uctxt->ctxt,
+ fd->subctxt,
+ uctxt->sc->base_addr);
+ binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
+ fd->subctxt,
+ uctxt->rcvhdrq);
+ binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
+ fd->subctxt,
+ uctxt->egrbufs.rcvtids[0].phys);
+ binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
+ fd->subctxt, 0);
+ /*
+ * user regs are at
+ * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
+ */
+ binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
+ fd->subctxt, 0);
+ offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
+ sizeof(*dd->events));
+ binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
+ fd->subctxt,
+ offset);
+ binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
+ fd->subctxt,
+ dd->status);
+ if (HFI1_CAP_IS_USET(DMA_RTAIL))
+ binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
+ fd->subctxt, 0);
+ if (uctxt->subctxt_cnt) {
+ binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
+ uctxt->ctxt,
+ fd->subctxt, 0);
+ binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
+ uctxt->ctxt,
+ fd->subctxt, 0);
+ binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
+ uctxt->ctxt,
+ fd->subctxt, 0);
+ }
+ sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
+ if (copy_to_user(ubase, &binfo, sz))
+ ret = -EFAULT;
+ return ret;
+}
+
+static unsigned int poll_urgent(struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned pollflag;
+
+ poll_wait(fp, &uctxt->wait, pt);
+
+ spin_lock_irq(&dd->uctxt_lock);
+ if (uctxt->urgent != uctxt->urgent_poll) {
+ pollflag = POLLIN | POLLRDNORM;
+ uctxt->urgent_poll = uctxt->urgent;
+ } else {
+ pollflag = 0;
+ set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
+ }
+ spin_unlock_irq(&dd->uctxt_lock);
+
+ return pollflag;
+}
+
+static unsigned int poll_next(struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned pollflag;
+
+ poll_wait(fp, &uctxt->wait, pt);
+
+ spin_lock_irq(&dd->uctxt_lock);
+ if (hdrqempty(uctxt)) {
+ set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+ pollflag = 0;
+ } else {
+ pollflag = POLLIN | POLLRDNORM;
+ }
+ spin_unlock_irq(&dd->uctxt_lock);
+
+ return pollflag;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ */
+int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
+{
+ struct hfi1_ctxtdata *uctxt;
+ struct hfi1_devdata *dd = ppd->dd;
+ unsigned ctxt;
+ int ret = 0;
+ unsigned long flags;
+
+ if (!dd->events) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+ ctxt++) {
+ uctxt = dd->rcd[ctxt];
+ if (uctxt) {
+ unsigned long *evs = dd->events +
+ (uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS;
+ int i;
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt, if any
+ */
+ set_bit(evtbit, evs);
+ for (i = 1; i < uctxt->subctxt_cnt; i++)
+ set_bit(evtbit, evs + i);
+ }
+ }
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+done:
+ return ret;
+}
+
+/**
+ * manage_rcvq - manage a context's receive queue
+ * @uctxt: the context
+ * @subctxt: the sub-context
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions. start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+ int start_stop)
+{
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned int rcvctrl_op;
+
+ if (subctxt)
+ goto bail;
+ /* atomically clear receive enable ctxt. */
+ if (start_stop) {
+ /*
+ * On enable, force in-memory copy of the tail register to
+ * 0, so that protocol code doesn't have to worry about
+ * whether or not the chip has yet updated the in-memory
+ * copy or not on return from the system call. The chip
+ * always resets it's tail register back to 0 on a
+ * transition from disabled to enabled.
+ */
+ if (uctxt->rcvhdrtail_kvaddr)
+ clear_rcvhdrtail(uctxt);
+ rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
+ } else {
+ rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+ }
+ hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+ /* always; new head should be equal to new tail; see above */
+bail:
+ return 0;
+}
+
+/*
+ * clear the event notifier events for this context.
+ * User process then performs actions appropriate to bit having been
+ * set, if desired, and checks again in future.
+ */
+static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
+ unsigned long events)
+{
+ int i;
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned long *evs;
+
+ if (!dd->events)
+ return 0;
+
+ evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + subctxt;
+
+ for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
+ if (!test_bit(i, &events))
+ continue;
+ clear_bit(i, evs);
+ }
+ return 0;
+}
+
+static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+ u16 pkey)
+{
+ int ret = -ENOENT, i, intable = 0;
+ struct hfi1_pportdata *ppd = uctxt->ppd;
+ struct hfi1_devdata *dd = uctxt->dd;
+
+ if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
+ if (pkey == ppd->pkeys[i]) {
+ intable = 1;
+ break;
+ }
+
+ if (intable)
+ ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+done:
+ return ret;
+}
+
+static void user_remove(struct hfi1_devdata *dd)
+{
+
+ hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+}
+
+static int user_add(struct hfi1_devdata *dd)
+{
+ char name[10];
+ int ret;
+
+ snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
+ ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
+ &dd->user_cdev, &dd->user_device,
+ true, &dd->kobj);
+ if (ret)
+ user_remove(dd);
+
+ return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int hfi1_device_create(struct hfi1_devdata *dd)
+{
+ return user_add(dd);
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ */
+void hfi1_device_remove(struct hfi1_devdata *dd)
+{
+ user_remove(dd);
+}
diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c
new file mode 100644
index 000000000000..13db8eb4f4ec
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/firmware.c
@@ -0,0 +1,2181 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/firmware.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/crc32.h>
+
+#include "hfi.h"
+#include "trace.h"
+
+/*
+ * Make it easy to toggle firmware file name and if it gets loaded by
+ * editing the following. This may be something we do while in development
+ * but not necessarily something a user would ever need to use.
+ */
+#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
+#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
+#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
+#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
+#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
+#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
+#define ALT_FW_8051_NAME_ASIC "hfi1_dc8051_d.fw"
+#define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw"
+#define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw"
+#define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw"
+
+static uint fw_8051_load = 1;
+static uint fw_fabric_serdes_load = 1;
+static uint fw_pcie_serdes_load = 1;
+static uint fw_sbus_load = 1;
+
+/*
+ * Access required in platform.c
+ * Maintains state of whether the platform config was fetched via the
+ * fallback option
+ */
+uint platform_config_load;
+
+/* Firmware file names get set in hfi1_firmware_init() based on the above */
+static char *fw_8051_name;
+static char *fw_fabric_serdes_name;
+static char *fw_sbus_name;
+static char *fw_pcie_serdes_name;
+static char *platform_config_name;
+
+#define SBUS_MAX_POLL_COUNT 100
+#define SBUS_COUNTER(reg, name) \
+ (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
+ ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
+
+/*
+ * Firmware security header.
+ */
+struct css_header {
+ u32 module_type;
+ u32 header_len;
+ u32 header_version;
+ u32 module_id;
+ u32 module_vendor;
+ u32 date; /* BCD yyyymmdd */
+ u32 size; /* in DWORDs */
+ u32 key_size; /* in DWORDs */
+ u32 modulus_size; /* in DWORDs */
+ u32 exponent_size; /* in DWORDs */
+ u32 reserved[22];
+};
+
+/* expected field values */
+#define CSS_MODULE_TYPE 0x00000006
+#define CSS_HEADER_LEN 0x000000a1
+#define CSS_HEADER_VERSION 0x00010000
+#define CSS_MODULE_VENDOR 0x00008086
+
+#define KEY_SIZE 256
+#define MU_SIZE 8
+#define EXPONENT_SIZE 4
+
+/* the file itself */
+struct firmware_file {
+ struct css_header css_header;
+ u8 modulus[KEY_SIZE];
+ u8 exponent[EXPONENT_SIZE];
+ u8 signature[KEY_SIZE];
+ u8 firmware[];
+};
+
+struct augmented_firmware_file {
+ struct css_header css_header;
+ u8 modulus[KEY_SIZE];
+ u8 exponent[EXPONENT_SIZE];
+ u8 signature[KEY_SIZE];
+ u8 r2[KEY_SIZE];
+ u8 mu[MU_SIZE];
+ u8 firmware[];
+};
+
+/* augmented file size difference */
+#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
+ sizeof(struct firmware_file))
+
+struct firmware_details {
+ /* Linux core piece */
+ const struct firmware *fw;
+
+ struct css_header *css_header;
+ u8 *firmware_ptr; /* pointer to binary data */
+ u32 firmware_len; /* length in bytes */
+ u8 *modulus; /* pointer to the modulus */
+ u8 *exponent; /* pointer to the exponent */
+ u8 *signature; /* pointer to the signature */
+ u8 *r2; /* pointer to r2 */
+ u8 *mu; /* pointer to mu */
+ struct augmented_firmware_file dummy_header;
+};
+
+/*
+ * The mutex protects fw_state, fw_err, and all of the firmware_details
+ * variables.
+ */
+static DEFINE_MUTEX(fw_mutex);
+enum fw_state {
+ FW_EMPTY,
+ FW_TRY,
+ FW_FINAL,
+ FW_ERR
+};
+
+static enum fw_state fw_state = FW_EMPTY;
+static int fw_err;
+static struct firmware_details fw_8051;
+static struct firmware_details fw_fabric;
+static struct firmware_details fw_pcie;
+static struct firmware_details fw_sbus;
+static const struct firmware *platform_config;
+
+/* flags for turn_off_spicos() */
+#define SPICO_SBUS 0x1
+#define SPICO_FABRIC 0x2
+#define ENABLE_SPICO_SMASK 0x1
+
+/* security block commands */
+#define RSA_CMD_INIT 0x1
+#define RSA_CMD_START 0x2
+
+/* security block status */
+#define RSA_STATUS_IDLE 0x0
+#define RSA_STATUS_ACTIVE 0x1
+#define RSA_STATUS_DONE 0x2
+#define RSA_STATUS_FAILED 0x3
+
+/* RSA engine timeout, in ms */
+#define RSA_ENGINE_TIMEOUT 100 /* ms */
+
+/* hardware mutex timeout, in ms */
+#define HM_TIMEOUT 10 /* ms */
+
+/* 8051 memory access timeout, in us */
+#define DC8051_ACCESS_TIMEOUT 100 /* us */
+
+/* the number of fabric SerDes on the SBus */
+#define NUM_FABRIC_SERDES 4
+
+/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */
+#define SBUS_READ_COMPLETE 0x4
+
+/* SBus fabric SerDes addresses, one set per HFI */
+static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
+ { 0x01, 0x02, 0x03, 0x04 },
+ { 0x28, 0x29, 0x2a, 0x2b }
+};
+
+/* SBus PCIe SerDes addresses, one set per HFI */
+static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
+ { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+ 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
+ { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
+ 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
+};
+
+/* SBus PCIe PCS addresses, one set per HFI */
+const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
+ { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
+ 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
+ { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+ 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
+};
+
+/* SBus fabric SerDes broadcast addresses, one per HFI */
+static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
+static const u8 all_fabric_serdes_broadcast = 0xe1;
+
+/* SBus PCIe SerDes broadcast addresses, one per HFI */
+const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
+static const u8 all_pcie_serdes_broadcast = 0xe0;
+
+/* forwards */
+static void dispose_one_firmware(struct firmware_details *fdet);
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet);
+static void dump_fw_version(struct hfi1_devdata *dd);
+
+/*
+ * Read a single 64-bit value from 8051 data memory.
+ *
+ * Expects:
+ * o caller to have already set up data read, no auto increment
+ * o caller to turn off read enable when finished
+ *
+ * The address argument is a byte offset. Bits 0:2 in the address are
+ * ignored - i.e. the hardware will always do aligned 8-byte reads as if
+ * the lower bits are zero.
+ *
+ * Return 0 on success, -ENXIO on a read error (timeout).
+ */
+static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
+{
+ u64 reg;
+ int count;
+
+ /* start the read at the given address */
+ reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+ << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+ | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+ /* wait until ACCESS_COMPLETED is set */
+ count = 0;
+ while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+ & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+ == 0) {
+ count++;
+ if (count > DC8051_ACCESS_TIMEOUT) {
+ dd_dev_err(dd, "timeout reading 8051 data\n");
+ return -ENXIO;
+ }
+ ndelay(10);
+ }
+
+ /* gather the data */
+ *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
+
+ return 0;
+}
+
+/*
+ * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks.
+ * Return 0 on success, -errno on error.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
+{
+ unsigned long flags;
+ u32 done;
+ int ret = 0;
+
+ spin_lock_irqsave(&dd->dc8051_memlock, flags);
+
+ /* data read set-up, no auto-increment */
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+ for (done = 0; done < len; addr += 8, done += 8, result++) {
+ ret = __read_8051_data(dd, addr, result);
+ if (ret)
+ break;
+ }
+
+ /* turn off read enable */
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+
+ spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
+
+ return ret;
+}
+
+/*
+ * Write data or code to the 8051 code or data RAM.
+ */
+static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
+ const u8 *data, u32 len)
+{
+ u64 reg;
+ u32 offset;
+ int aligned, count;
+
+ /* check alignment */
+ aligned = ((unsigned long)data & 0x7) == 0;
+
+ /* write set-up */
+ reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
+ | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
+
+ reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+ << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+ | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+ /* write */
+ for (offset = 0; offset < len; offset += 8) {
+ int bytes = len - offset;
+
+ if (bytes < 8) {
+ reg = 0;
+ memcpy(&reg, &data[offset], bytes);
+ } else if (aligned) {
+ reg = *(u64 *)&data[offset];
+ } else {
+ memcpy(&reg, &data[offset], 8);
+ }
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
+
+ /* wait until ACCESS_COMPLETED is set */
+ count = 0;
+ while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+ & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+ == 0) {
+ count++;
+ if (count > DC8051_ACCESS_TIMEOUT) {
+ dd_dev_err(dd, "timeout writing 8051 data\n");
+ return -ENXIO;
+ }
+ udelay(1);
+ }
+ }
+
+ /* turn off write access, auto increment (also sets to data access) */
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+ write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+ return 0;
+}
+
+/* return 0 if values match, non-zero and complain otherwise */
+static int invalid_header(struct hfi1_devdata *dd, const char *what,
+ u32 actual, u32 expected)
+{
+ if (actual == expected)
+ return 0;
+
+ dd_dev_err(dd,
+ "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
+ what, expected, actual);
+ return 1;
+}
+
+/*
+ * Verify that the static fields in the CSS header match.
+ */
+static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
+{
+ /* verify CSS header fields (most sizes are in DW, so add /4) */
+ if (invalid_header(dd, "module_type", css->module_type,
+ CSS_MODULE_TYPE) ||
+ invalid_header(dd, "header_len", css->header_len,
+ (sizeof(struct firmware_file) / 4)) ||
+ invalid_header(dd, "header_version", css->header_version,
+ CSS_HEADER_VERSION) ||
+ invalid_header(dd, "module_vendor", css->module_vendor,
+ CSS_MODULE_VENDOR) ||
+ invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) ||
+ invalid_header(dd, "modulus_size", css->modulus_size,
+ KEY_SIZE / 4) ||
+ invalid_header(dd, "exponent_size", css->exponent_size,
+ EXPONENT_SIZE / 4)) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Make sure there are at least some bytes after the prefix.
+ */
+static int payload_check(struct hfi1_devdata *dd, const char *name,
+ long file_size, long prefix_size)
+{
+ /* make sure we have some payload */
+ if (prefix_size >= file_size) {
+ dd_dev_err(dd,
+ "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
+ name, file_size, prefix_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Request the firmware from the system. Extract the pieces and fill in
+ * fdet. If successful, the caller will need to call dispose_one_firmware().
+ * Returns 0 on success, -ERRNO on error.
+ */
+static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
+ struct firmware_details *fdet)
+{
+ struct css_header *css;
+ int ret;
+
+ memset(fdet, 0, sizeof(*fdet));
+
+ ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
+ if (ret) {
+ dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n",
+ name, ret);
+ return ret;
+ }
+
+ /* verify the firmware */
+ if (fdet->fw->size < sizeof(struct css_header)) {
+ dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
+ ret = -EINVAL;
+ goto done;
+ }
+ css = (struct css_header *)fdet->fw->data;
+
+ hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
+ hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
+ hfi1_cdbg(FIRMWARE, "CSS structure:");
+ hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type);
+ hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)",
+ css->header_len, 4 * css->header_len);
+ hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version);
+ hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id);
+ hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor);
+ hfi1_cdbg(FIRMWARE, " date 0x%x", css->date);
+ hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)",
+ css->size, 4 * css->size);
+ hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)",
+ css->key_size, 4 * css->key_size);
+ hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)",
+ css->modulus_size, 4 * css->modulus_size);
+ hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)",
+ css->exponent_size, 4 * css->exponent_size);
+ hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
+ fdet->fw->size - sizeof(struct firmware_file));
+
+ /*
+ * If the file does not have a valid CSS header, fail.
+ * Otherwise, check the CSS size field for an expected size.
+ * The augmented file has r2 and mu inserted after the header
+ * was generated, so there will be a known difference between
+ * the CSS header size and the actual file size. Use this
+ * difference to identify an augmented file.
+ *
+ * Note: css->size is in DWORDs, multiply by 4 to get bytes.
+ */
+ ret = verify_css_header(dd, css);
+ if (ret) {
+ dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
+ } else if ((css->size * 4) == fdet->fw->size) {
+ /* non-augmented firmware file */
+ struct firmware_file *ff = (struct firmware_file *)
+ fdet->fw->data;
+
+ /* make sure there are bytes in the payload */
+ ret = payload_check(dd, name, fdet->fw->size,
+ sizeof(struct firmware_file));
+ if (ret == 0) {
+ fdet->css_header = css;
+ fdet->modulus = ff->modulus;
+ fdet->exponent = ff->exponent;
+ fdet->signature = ff->signature;
+ fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
+ fdet->mu = fdet->dummy_header.mu; /* use dummy space */
+ fdet->firmware_ptr = ff->firmware;
+ fdet->firmware_len = fdet->fw->size -
+ sizeof(struct firmware_file);
+ /*
+ * Header does not include r2 and mu - generate here.
+ * For now, fail.
+ */
+ dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
+ ret = -EINVAL;
+ }
+ } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) {
+ /* augmented firmware file */
+ struct augmented_firmware_file *aff =
+ (struct augmented_firmware_file *)fdet->fw->data;
+
+ /* make sure there are bytes in the payload */
+ ret = payload_check(dd, name, fdet->fw->size,
+ sizeof(struct augmented_firmware_file));
+ if (ret == 0) {
+ fdet->css_header = css;
+ fdet->modulus = aff->modulus;
+ fdet->exponent = aff->exponent;
+ fdet->signature = aff->signature;
+ fdet->r2 = aff->r2;
+ fdet->mu = aff->mu;
+ fdet->firmware_ptr = aff->firmware;
+ fdet->firmware_len = fdet->fw->size -
+ sizeof(struct augmented_firmware_file);
+ }
+ } else {
+ /* css->size check failed */
+ dd_dev_err(dd,
+ "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
+ fdet->fw->size / 4,
+ (fdet->fw->size - AUGMENT_SIZE) / 4,
+ css->size);
+
+ ret = -EINVAL;
+ }
+
+done:
+ /* if returning an error, clean up after ourselves */
+ if (ret)
+ dispose_one_firmware(fdet);
+ return ret;
+}
+
+static void dispose_one_firmware(struct firmware_details *fdet)
+{
+ release_firmware(fdet->fw);
+ /* erase all previous information */
+ memset(fdet, 0, sizeof(*fdet));
+}
+
+/*
+ * Obtain the 4 firmwares from the OS. All must be obtained at once or not
+ * at all. If called with the firmware state in FW_TRY, use alternate names.
+ * On exit, this routine will have set the firmware state to one of FW_TRY,
+ * FW_FINAL, or FW_ERR.
+ *
+ * Must be holding fw_mutex.
+ */
+static void __obtain_firmware(struct hfi1_devdata *dd)
+{
+ int err = 0;
+
+ if (fw_state == FW_FINAL) /* nothing more to obtain */
+ return;
+ if (fw_state == FW_ERR) /* already in error */
+ return;
+
+ /* fw_state is FW_EMPTY or FW_TRY */
+retry:
+ if (fw_state == FW_TRY) {
+ /*
+ * We tried the original and it failed. Move to the
+ * alternate.
+ */
+ dd_dev_warn(dd, "using alternate firmware names\n");
+ /*
+ * Let others run. Some systems, when missing firmware, does
+ * something that holds for 30 seconds. If we do that twice
+ * in a row it triggers task blocked warning.
+ */
+ cond_resched();
+ if (fw_8051_load)
+ dispose_one_firmware(&fw_8051);
+ if (fw_fabric_serdes_load)
+ dispose_one_firmware(&fw_fabric);
+ if (fw_sbus_load)
+ dispose_one_firmware(&fw_sbus);
+ if (fw_pcie_serdes_load)
+ dispose_one_firmware(&fw_pcie);
+ fw_8051_name = ALT_FW_8051_NAME_ASIC;
+ fw_fabric_serdes_name = ALT_FW_FABRIC_NAME;
+ fw_sbus_name = ALT_FW_SBUS_NAME;
+ fw_pcie_serdes_name = ALT_FW_PCIE_NAME;
+ }
+
+ if (fw_sbus_load) {
+ err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
+ if (err)
+ goto done;
+ }
+
+ if (fw_pcie_serdes_load) {
+ err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
+ if (err)
+ goto done;
+ }
+
+ if (fw_fabric_serdes_load) {
+ err = obtain_one_firmware(dd, fw_fabric_serdes_name,
+ &fw_fabric);
+ if (err)
+ goto done;
+ }
+
+ if (fw_8051_load) {
+ err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
+ if (err)
+ goto done;
+ }
+
+done:
+ if (err) {
+ /* oops, had problems obtaining a firmware */
+ if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) {
+ /* retry with alternate (RTL only) */
+ fw_state = FW_TRY;
+ goto retry;
+ }
+ dd_dev_err(dd, "unable to obtain working firmware\n");
+ fw_state = FW_ERR;
+ fw_err = -ENOENT;
+ } else {
+ /* success */
+ if (fw_state == FW_EMPTY &&
+ dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+ fw_state = FW_TRY; /* may retry later */
+ else
+ fw_state = FW_FINAL; /* cannot try again */
+ }
+}
+
+/*
+ * Called by all HFIs when loading their firmware - i.e. device probe time.
+ * The first one will do the actual firmware load. Use a mutex to resolve
+ * any possible race condition.
+ *
+ * The call to this routine cannot be moved to driver load because the kernel
+ * call request_firmware() requires a device which is only available after
+ * the first device probe.
+ */
+static int obtain_firmware(struct hfi1_devdata *dd)
+{
+ unsigned long timeout;
+ int err = 0;
+
+ mutex_lock(&fw_mutex);
+
+ /* 40s delay due to long delay on missing firmware on some systems */
+ timeout = jiffies + msecs_to_jiffies(40000);
+ while (fw_state == FW_TRY) {
+ /*
+ * Another device is trying the firmware. Wait until it
+ * decides what works (or not).
+ */
+ if (time_after(jiffies, timeout)) {
+ /* waited too long */
+ dd_dev_err(dd, "Timeout waiting for firmware try");
+ fw_state = FW_ERR;
+ fw_err = -ETIMEDOUT;
+ break;
+ }
+ mutex_unlock(&fw_mutex);
+ msleep(20); /* arbitrary delay */
+ mutex_lock(&fw_mutex);
+ }
+ /* not in FW_TRY state */
+
+ if (fw_state == FW_FINAL) {
+ if (platform_config) {
+ dd->platform_config.data = platform_config->data;
+ dd->platform_config.size = platform_config->size;
+ }
+ goto done; /* already acquired */
+ } else if (fw_state == FW_ERR) {
+ goto done; /* already tried and failed */
+ }
+ /* fw_state is FW_EMPTY */
+
+ /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */
+ __obtain_firmware(dd);
+
+ if (platform_config_load) {
+ platform_config = NULL;
+ err = request_firmware(&platform_config, platform_config_name,
+ &dd->pcidev->dev);
+ if (err) {
+ platform_config = NULL;
+ goto done;
+ }
+ dd->platform_config.data = platform_config->data;
+ dd->platform_config.size = platform_config->size;
+ }
+
+done:
+ mutex_unlock(&fw_mutex);
+
+ return fw_err;
+}
+
+/*
+ * Called when the driver unloads. The timing is asymmetric with its
+ * counterpart, obtain_firmware(). If called at device remove time,
+ * then it is conceivable that another device could probe while the
+ * firmware is being disposed. The mutexes can be moved to do that
+ * safely, but then the firmware would be requested from the OS multiple
+ * times.
+ *
+ * No mutex is needed as the driver is unloading and there cannot be any
+ * other callers.
+ */
+void dispose_firmware(void)
+{
+ dispose_one_firmware(&fw_8051);
+ dispose_one_firmware(&fw_fabric);
+ dispose_one_firmware(&fw_pcie);
+ dispose_one_firmware(&fw_sbus);
+
+ release_firmware(platform_config);
+ platform_config = NULL;
+
+ /* retain the error state, otherwise revert to empty */
+ if (fw_state != FW_ERR)
+ fw_state = FW_EMPTY;
+}
+
+/*
+ * Called with the result of a firmware download.
+ *
+ * Return 1 to retry loading the firmware, 0 to stop.
+ */
+static int retry_firmware(struct hfi1_devdata *dd, int load_result)
+{
+ int retry;
+
+ mutex_lock(&fw_mutex);
+
+ if (load_result == 0) {
+ /*
+ * The load succeeded, so expect all others to do the same.
+ * Do not retry again.
+ */
+ if (fw_state == FW_TRY)
+ fw_state = FW_FINAL;
+ retry = 0; /* do NOT retry */
+ } else if (fw_state == FW_TRY) {
+ /* load failed, obtain alternate firmware */
+ __obtain_firmware(dd);
+ retry = (fw_state == FW_FINAL);
+ } else {
+ /* else in FW_FINAL or FW_ERR, no retry in either case */
+ retry = 0;
+ }
+
+ mutex_unlock(&fw_mutex);
+ return retry;
+}
+
+/*
+ * Write a block of data to a given array CSR. All calls will be in
+ * multiples of 8 bytes.
+ */
+static void write_rsa_data(struct hfi1_devdata *dd, int what,
+ const u8 *data, int nbytes)
+{
+ int qw_size = nbytes / 8;
+ int i;
+
+ if (((unsigned long)data & 0x7) == 0) {
+ /* aligned */
+ u64 *ptr = (u64 *)data;
+
+ for (i = 0; i < qw_size; i++, ptr++)
+ write_csr(dd, what + (8 * i), *ptr);
+ } else {
+ /* not aligned */
+ for (i = 0; i < qw_size; i++, data += 8) {
+ u64 value;
+
+ memcpy(&value, data, 8);
+ write_csr(dd, what + (8 * i), value);
+ }
+ }
+}
+
+/*
+ * Write a block of data to a given CSR as a stream of writes. All calls will
+ * be in multiples of 8 bytes.
+ */
+static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
+ const u8 *data, int nbytes)
+{
+ u64 *ptr = (u64 *)data;
+ int qw_size = nbytes / 8;
+
+ for (; qw_size > 0; qw_size--, ptr++)
+ write_csr(dd, what, *ptr);
+}
+
+/*
+ * Download the signature and start the RSA mechanism. Wait for
+ * RSA_ENGINE_TIMEOUT before giving up.
+ */
+static int run_rsa(struct hfi1_devdata *dd, const char *who,
+ const u8 *signature)
+{
+ unsigned long timeout;
+ u64 reg;
+ u32 status;
+ int ret = 0;
+
+ /* write the signature */
+ write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
+
+ /* initialize RSA */
+ write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
+
+ /*
+ * Make sure the engine is idle and insert a delay between the two
+ * writes to MISC_CFG_RSA_CMD.
+ */
+ status = (read_csr(dd, MISC_CFG_FW_CTRL)
+ & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+ >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+ if (status != RSA_STATUS_IDLE) {
+ dd_dev_err(dd, "%s security engine not idle - giving up\n",
+ who);
+ return -EBUSY;
+ }
+
+ /* start RSA */
+ write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
+
+ /*
+ * Look for the result.
+ *
+ * The RSA engine is hooked up to two MISC errors. The driver
+ * masks these errors as they do not respond to the standard
+ * error "clear down" mechanism. Look for these errors here and
+ * clear them when possible. This routine will exit with the
+ * errors of the current run still set.
+ *
+ * MISC_FW_AUTH_FAILED_ERR
+ * Firmware authorization failed. This can be cleared by
+ * re-initializing the RSA engine, then clearing the status bit.
+ * Do not re-init the RSA angine immediately after a successful
+ * run - this will reset the current authorization.
+ *
+ * MISC_KEY_MISMATCH_ERR
+ * Key does not match. The only way to clear this is to load
+ * a matching key then clear the status bit. If this error
+ * is raised, it will persist outside of this routine until a
+ * matching key is loaded.
+ */
+ timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
+ while (1) {
+ status = (read_csr(dd, MISC_CFG_FW_CTRL)
+ & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+ >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+
+ if (status == RSA_STATUS_IDLE) {
+ /* should not happen */
+ dd_dev_err(dd, "%s firmware security bad idle state\n",
+ who);
+ ret = -EINVAL;
+ break;
+ } else if (status == RSA_STATUS_DONE) {
+ /* finished successfully */
+ break;
+ } else if (status == RSA_STATUS_FAILED) {
+ /* finished unsuccessfully */
+ ret = -EINVAL;
+ break;
+ }
+ /* else still active */
+
+ if (time_after(jiffies, timeout)) {
+ /*
+ * Timed out while active. We can't reset the engine
+ * if it is stuck active, but run through the
+ * error code to see what error bits are set.
+ */
+ dd_dev_err(dd, "%s firmware security time out\n", who);
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ msleep(20);
+ }
+
+ /*
+ * Arrive here on success or failure. Clear all RSA engine
+ * errors. All current errors will stick - the RSA logic is keeping
+ * error high. All previous errors will clear - the RSA logic
+ * is not keeping the error high.
+ */
+ write_csr(dd, MISC_ERR_CLEAR,
+ MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK |
+ MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
+ /*
+ * All that is left are the current errors. Print warnings on
+ * authorization failure details, if any. Firmware authorization
+ * can be retried, so these are only warnings.
+ */
+ reg = read_csr(dd, MISC_ERR_STATUS);
+ if (ret) {
+ if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
+ dd_dev_warn(dd, "%s firmware authorization failed\n",
+ who);
+ if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
+ dd_dev_warn(dd, "%s firmware key mismatch\n", who);
+ }
+
+ return ret;
+}
+
+static void load_security_variables(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ /* Security variables a. Write the modulus */
+ write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
+ /* Security variables b. Write the r2 */
+ write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
+ /* Security variables c. Write the mu */
+ write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
+ /* Security variables d. Write the header */
+ write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
+ (u8 *)fdet->css_header,
+ sizeof(struct css_header));
+}
+
+/* return the 8051 firmware state */
+static inline u32 get_firmware_state(struct hfi1_devdata *dd)
+{
+ u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+ return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
+ & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
+}
+
+/*
+ * Wait until the firmware is up and ready to take host requests.
+ * Return 0 on success, -ETIMEDOUT on timeout.
+ */
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
+{
+ unsigned long timeout;
+
+ /* in the simulator, the fake 8051 is always ready */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+ return 0;
+
+ timeout = msecs_to_jiffies(mstimeout) + jiffies;
+ while (1) {
+ if (get_firmware_state(dd) == 0xa0) /* ready */
+ return 0;
+ if (time_after(jiffies, timeout)) /* timed out */
+ return -ETIMEDOUT;
+ usleep_range(1950, 2050); /* sleep 2ms-ish */
+ }
+}
+
+/*
+ * Load the 8051 firmware.
+ */
+static int load_8051_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ u64 reg;
+ int ret;
+ u8 ver_a, ver_b;
+
+ /*
+ * DC Reset sequence
+ * Load DC 8051 firmware
+ */
+ /*
+ * DC reset step 1: Reset DC8051
+ */
+ reg = DC_DC8051_CFG_RST_M8051W_SMASK
+ | DC_DC8051_CFG_RST_CRAM_SMASK
+ | DC_DC8051_CFG_RST_DRAM_SMASK
+ | DC_DC8051_CFG_RST_IRAM_SMASK
+ | DC_DC8051_CFG_RST_SFR_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+ /*
+ * DC reset step 2 (optional): Load 8051 data memory with link
+ * configuration
+ */
+
+ /*
+ * DC reset step 3: Load DC8051 firmware
+ */
+ /* release all but the core reset */
+ reg = DC_DC8051_CFG_RST_M8051W_SMASK;
+ write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+ /* Firmware load step 1 */
+ load_security_variables(dd, fdet);
+
+ /*
+ * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
+ */
+ write_csr(dd, MISC_CFG_FW_CTRL, 0);
+
+ /* Firmware load steps 3-5 */
+ ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
+ fdet->firmware_len);
+ if (ret)
+ return ret;
+
+ /*
+ * DC reset step 4. Host starts the DC8051 firmware
+ */
+ /*
+ * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED
+ */
+ write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
+
+ /* Firmware load steps 7-10 */
+ ret = run_rsa(dd, "8051", fdet->signature);
+ if (ret)
+ return ret;
+
+ /* clear all reset bits, releasing the 8051 */
+ write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
+ /*
+ * DC reset step 5. Wait for firmware to be ready to accept host
+ * requests.
+ */
+ ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+ if (ret) { /* timed out */
+ dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+ get_firmware_state(dd));
+ return -ETIMEDOUT;
+ }
+
+ read_misc_status(dd, &ver_a, &ver_b);
+ dd_dev_info(dd, "8051 firmware version %d.%d\n",
+ (int)ver_b, (int)ver_a);
+ dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
+
+ return 0;
+}
+
+/*
+ * Write the SBus request register
+ *
+ * No need for masking - the arguments are sized exactly.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+ write_csr(dd, ASIC_CFG_SBUS_REQUEST,
+ ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) |
+ ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) |
+ ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) |
+ ((u64)receiver_addr <<
+ ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
+}
+
+/*
+ * Read a value from the SBus.
+ *
+ * Requires the caller to be in fast mode
+ */
+static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr,
+ u32 data_in)
+{
+ u64 reg;
+ int retries;
+ int success = 0;
+ u32 result = 0;
+ u32 result_code = 0;
+
+ sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in);
+
+ for (retries = 0; retries < 100; retries++) {
+ usleep_range(1000, 1200); /* arbitrary */
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT)
+ & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK;
+ if (result_code != SBUS_READ_COMPLETE)
+ continue;
+
+ success = 1;
+ result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT)
+ & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK;
+ break;
+ }
+
+ if (!success) {
+ dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__,
+ result_code);
+ }
+
+ return result;
+}
+
+/*
+ * Turn off the SBus and fabric serdes spicos.
+ *
+ * + Must be called with Sbus fast mode turned on.
+ * + Must be called after fabric serdes broadcast is set up.
+ * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
+ * when using MISC_CFG_FW_CTRL.
+ */
+static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
+{
+ /* only needed on A0 */
+ if (!is_ax(dd))
+ return;
+
+ dd_dev_info(dd, "Turning off spicos:%s%s\n",
+ flags & SPICO_SBUS ? " SBus" : "",
+ flags & SPICO_FABRIC ? " fabric" : "");
+
+ write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
+ /* disable SBus spico */
+ if (flags & SPICO_SBUS)
+ sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
+ WRITE_SBUS_RECEIVER, 0x00000040);
+
+ /* disable the fabric serdes spicos */
+ if (flags & SPICO_FABRIC)
+ sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
+ 0x07, WRITE_SBUS_RECEIVER, 0x00000000);
+ write_csr(dd, MISC_CFG_FW_CTRL, 0);
+}
+
+/*
+ * Reset all of the fabric serdes for this HFI in preparation to take the
+ * link to Polling.
+ *
+ * To do a reset, we need to write to to the serdes registers. Unfortunately,
+ * the fabric serdes download to the other HFI on the ASIC will have turned
+ * off the firmware validation on this HFI. This means we can't write to the
+ * registers to reset the serdes. Work around this by performing a complete
+ * re-download and validation of the fabric serdes firmware. This, as a
+ * by-product, will reset the serdes. NOTE: the re-download requires that
+ * the 8051 be in the Offline state. I.e. not actively trying to use the
+ * serdes. This routine is called at the point where the link is Offline and
+ * is getting ready to go to Polling.
+ */
+void fabric_serdes_reset(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ if (!fw_fabric_serdes_load)
+ return;
+
+ ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+ if (ret) {
+ dd_dev_err(dd,
+ "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n");
+ return;
+ }
+ set_sbus_fast_mode(dd);
+
+ if (is_ax(dd)) {
+ /* A0 serdes do not work with a re-download */
+ u8 ra = fabric_serdes_broadcast[dd->hfi1_id];
+
+ /* place SerDes in reset and disable SPICO */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+ /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+ udelay(1);
+ /* remove SerDes reset */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+ /* turn SPICO enable on */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+ } else {
+ turn_off_spicos(dd, SPICO_FABRIC);
+ /*
+ * No need for firmware retry - what to download has already
+ * been decided.
+ * No need to pay attention to the load return - the only
+ * failure is a validation failure, which has already been
+ * checked by the initial download.
+ */
+ (void)load_fabric_serdes_firmware(dd, &fw_fabric);
+ }
+
+ clear_sbus_fast_mode(dd);
+ release_chip_resource(dd, CR_SBUS);
+}
+
+/* Access to the SBus in this routine should probably be serialized */
+int sbus_request_slow(struct hfi1_devdata *dd,
+ u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+ u64 reg, count = 0;
+
+ /* make sure fast mode is clear */
+ clear_sbus_fast_mode(dd);
+
+ sbus_request(dd, receiver_addr, data_addr, command, data_in);
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+ ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
+ /* Wait for both DONE and RCV_DATA_VALID to go high */
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+ (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
+ if (count++ >= SBUS_MAX_POLL_COUNT) {
+ u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+ /*
+ * If the loop has timed out, we are OK if DONE bit
+ * is set and RCV_DATA_VALID and EXECUTE counters
+ * are the same. If not, we cannot proceed.
+ */
+ if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+ (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
+ SBUS_COUNTER(counts, EXECUTE)))
+ break;
+ return -ETIMEDOUT;
+ }
+ udelay(1);
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ }
+ count = 0;
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+ /* Wait for DONE to clear after EXECUTE is cleared */
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
+ if (count++ >= SBUS_MAX_POLL_COUNT)
+ return -ETIME;
+ udelay(1);
+ reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+ }
+ return 0;
+}
+
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ int i, err;
+ const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
+
+ dd_dev_info(dd, "Downloading fabric firmware\n");
+
+ /* step 1: load security variables */
+ load_security_variables(dd, fdet);
+ /* step 2: place SerDes in reset and disable SPICO */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+ /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+ udelay(1);
+ /* step 3: remove SerDes reset */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+ /* step 4: assert IMEM override */
+ sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
+ /* step 5: download SerDes machine code */
+ for (i = 0; i < fdet->firmware_len; i += 4) {
+ sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
+ *(u32 *)&fdet->firmware_ptr[i]);
+ }
+ /* step 6: IMEM override off */
+ sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
+ /* step 7: turn ECC on */
+ sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+ /* steps 8-11: run the RSA engine */
+ err = run_rsa(dd, "fabric serdes", fdet->signature);
+ if (err)
+ return err;
+
+ /* step 12: turn SPICO enable on */
+ sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+ /* step 13: enable core hardware interrupts */
+ sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
+
+ return 0;
+}
+
+static int load_sbus_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ int i, err;
+ const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+ dd_dev_info(dd, "Downloading SBus firmware\n");
+
+ /* step 1: load security variables */
+ load_security_variables(dd, fdet);
+ /* step 2: place SPICO into reset and enable off */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
+ /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
+ /* step 4: set starting IMEM address for burst download */
+ sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
+ /* step 5: download the SBus Master machine code */
+ for (i = 0; i < fdet->firmware_len; i += 4) {
+ sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
+ *(u32 *)&fdet->firmware_ptr[i]);
+ }
+ /* step 6: set IMEM_CNTL_EN off */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
+ /* step 7: turn ECC on */
+ sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+ /* steps 8-11: run the RSA engine */
+ err = run_rsa(dd, "SBus", fdet->signature);
+ if (err)
+ return err;
+
+ /* step 12: set SPICO_ENABLE on */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+
+ return 0;
+}
+
+static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
+ struct firmware_details *fdet)
+{
+ int i;
+ const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+ dd_dev_info(dd, "Downloading PCIe firmware\n");
+
+ /* step 1: load security variables */
+ load_security_variables(dd, fdet);
+ /* step 2: assert single step (halts the SBus Master spico) */
+ sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
+ /* step 3: enable XDMEM access */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
+ /* step 4: load firmware into SBus Master XDMEM */
+ /*
+ * NOTE: the dmem address, write_en, and wdata are all pre-packed,
+ * we only need to pick up the bytes and write them
+ */
+ for (i = 0; i < fdet->firmware_len; i += 4) {
+ sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
+ *(u32 *)&fdet->firmware_ptr[i]);
+ }
+ /* step 5: disable XDMEM access */
+ sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+ /* step 6: allow SBus Spico to run */
+ sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
+
+ /*
+ * steps 7-11: run RSA, if it succeeds, firmware is available to
+ * be swapped
+ */
+ return run_rsa(dd, "PCIe serdes", fdet->signature);
+}
+
+/*
+ * Set the given broadcast values on the given list of devices.
+ */
+static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
+ const u8 *addrs, int count)
+{
+ while (--count >= 0) {
+ /*
+ * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
+ * defaults for everything else. Do not read-modify-write,
+ * per instruction from the manufacturer.
+ *
+ * Register 0xfd:
+ * bits what
+ * ----- ---------------------------------
+ * 0 IGNORE_BROADCAST (default 0)
+ * 11:4 BROADCAST_GROUP_1 (default 0xff)
+ * 23:16 BROADCAST_GROUP_2 (default 0xff)
+ */
+ sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
+ (u32)bg1 << 4 | (u32)bg2 << 16);
+ }
+}
+
+int acquire_hw_mutex(struct hfi1_devdata *dd)
+{
+ unsigned long timeout;
+ int try = 0;
+ u8 mask = 1 << dd->hfi1_id;
+ u8 user;
+
+retry:
+ timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
+ while (1) {
+ write_csr(dd, ASIC_CFG_MUTEX, mask);
+ user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
+ if (user == mask)
+ return 0; /* success */
+ if (time_after(jiffies, timeout))
+ break; /* timed out */
+ msleep(20);
+ }
+
+ /* timed out */
+ dd_dev_err(dd,
+ "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
+ (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
+
+ if (try == 0) {
+ /* break mutex and retry */
+ write_csr(dd, ASIC_CFG_MUTEX, 0);
+ try++;
+ goto retry;
+ }
+
+ return -EBUSY;
+}
+
+void release_hw_mutex(struct hfi1_devdata *dd)
+{
+ write_csr(dd, ASIC_CFG_MUTEX, 0);
+}
+
+/* return the given resource bit(s) as a mask for the given HFI */
+static inline u64 resource_mask(u32 hfi1_id, u32 resource)
+{
+ return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0);
+}
+
+static void fail_mutex_acquire_message(struct hfi1_devdata *dd,
+ const char *func)
+{
+ dd_dev_err(dd,
+ "%s: hardware mutex stuck - suggest rebooting the machine\n",
+ func);
+}
+
+/*
+ * Acquire access to a chip resource.
+ *
+ * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed.
+ */
+static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource)
+{
+ u64 scratch0, all_bits, my_bit;
+ int ret;
+
+ if (resource & CR_DYN_MASK) {
+ /* a dynamic resource is in use if either HFI has set the bit */
+ if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 &&
+ (resource & (CR_I2C1 | CR_I2C2))) {
+ /* discrete devices must serialize across both chains */
+ all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) |
+ resource_mask(1, CR_I2C1 | CR_I2C2);
+ } else {
+ all_bits = resource_mask(0, resource) |
+ resource_mask(1, resource);
+ }
+ my_bit = resource_mask(dd->hfi1_id, resource);
+ } else {
+ /* non-dynamic resources are not split between HFIs */
+ all_bits = resource;
+ my_bit = resource;
+ }
+
+ /* lock against other callers within the driver wanting a resource */
+ mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+ ret = acquire_hw_mutex(dd);
+ if (ret) {
+ fail_mutex_acquire_message(dd, __func__);
+ ret = -EIO;
+ goto done;
+ }
+
+ scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+ if (scratch0 & all_bits) {
+ ret = -EBUSY;
+ } else {
+ write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit);
+ /* force write to be visible to other HFI on another OS */
+ (void)read_csr(dd, ASIC_CFG_SCRATCH);
+ }
+
+ release_hw_mutex(dd);
+
+done:
+ mutex_unlock(&dd->asic_data->asic_resource_mutex);
+ return ret;
+}
+
+/*
+ * Acquire access to a chip resource, wait up to mswait milliseconds for
+ * the resource to become available.
+ *
+ * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex
+ * acquire failed.
+ */
+int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait)
+{
+ unsigned long timeout;
+ int ret;
+
+ timeout = jiffies + msecs_to_jiffies(mswait);
+ while (1) {
+ ret = __acquire_chip_resource(dd, resource);
+ if (ret != -EBUSY)
+ return ret;
+ /* resource is busy, check our timeout */
+ if (time_after_eq(jiffies, timeout))
+ return -EBUSY;
+ usleep_range(80, 120); /* arbitrary delay */
+ }
+}
+
+/*
+ * Release access to a chip resource
+ */
+void release_chip_resource(struct hfi1_devdata *dd, u32 resource)
+{
+ u64 scratch0, bit;
+
+ /* only dynamic resources should ever be cleared */
+ if (!(resource & CR_DYN_MASK)) {
+ dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__,
+ resource);
+ return;
+ }
+ bit = resource_mask(dd->hfi1_id, resource);
+
+ /* lock against other callers within the driver wanting a resource */
+ mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+ if (acquire_hw_mutex(dd)) {
+ fail_mutex_acquire_message(dd, __func__);
+ goto done;
+ }
+
+ scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+ if ((scratch0 & bit) != 0) {
+ scratch0 &= ~bit;
+ write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+ /* force write to be visible to other HFI on another OS */
+ (void)read_csr(dd, ASIC_CFG_SCRATCH);
+ } else {
+ dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n",
+ __func__, dd->hfi1_id, resource);
+ }
+
+ release_hw_mutex(dd);
+
+done:
+ mutex_unlock(&dd->asic_data->asic_resource_mutex);
+}
+
+/*
+ * Return true if resource is set, false otherwise. Print a warning
+ * if not set and a function is supplied.
+ */
+bool check_chip_resource(struct hfi1_devdata *dd, u32 resource,
+ const char *func)
+{
+ u64 scratch0, bit;
+
+ if (resource & CR_DYN_MASK)
+ bit = resource_mask(dd->hfi1_id, resource);
+ else
+ bit = resource;
+
+ scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+ if ((scratch0 & bit) == 0) {
+ if (func)
+ dd_dev_warn(dd,
+ "%s: id %d, resource 0x%x, not acquired!\n",
+ func, dd->hfi1_id, resource);
+ return false;
+ }
+ return true;
+}
+
+static void clear_chip_resources(struct hfi1_devdata *dd, const char *func)
+{
+ u64 scratch0;
+
+ /* lock against other callers within the driver wanting a resource */
+ mutex_lock(&dd->asic_data->asic_resource_mutex);
+
+ if (acquire_hw_mutex(dd)) {
+ fail_mutex_acquire_message(dd, func);
+ goto done;
+ }
+
+ /* clear all dynamic access bits for this HFI */
+ scratch0 = read_csr(dd, ASIC_CFG_SCRATCH);
+ scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK);
+ write_csr(dd, ASIC_CFG_SCRATCH, scratch0);
+ /* force write to be visible to other HFI on another OS */
+ (void)read_csr(dd, ASIC_CFG_SCRATCH);
+
+ release_hw_mutex(dd);
+
+done:
+ mutex_unlock(&dd->asic_data->asic_resource_mutex);
+}
+
+void init_chip_resources(struct hfi1_devdata *dd)
+{
+ /* clear any holds left by us */
+ clear_chip_resources(dd, __func__);
+}
+
+void finish_chip_resources(struct hfi1_devdata *dd)
+{
+ /* clear any holds left by us */
+ clear_chip_resources(dd, __func__);
+}
+
+void set_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+ ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
+}
+
+void clear_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+ u64 reg, count = 0;
+
+ reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+ while (SBUS_COUNTER(reg, EXECUTE) !=
+ SBUS_COUNTER(reg, RCV_DATA_VALID)) {
+ if (count++ >= SBUS_MAX_POLL_COUNT)
+ break;
+ udelay(1);
+ reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+ }
+ write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+}
+
+int load_firmware(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ if (fw_fabric_serdes_load) {
+ ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+ if (ret)
+ return ret;
+
+ set_sbus_fast_mode(dd);
+
+ set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
+ fabric_serdes_broadcast[dd->hfi1_id],
+ fabric_serdes_addrs[dd->hfi1_id],
+ NUM_FABRIC_SERDES);
+ turn_off_spicos(dd, SPICO_FABRIC);
+ do {
+ ret = load_fabric_serdes_firmware(dd, &fw_fabric);
+ } while (retry_firmware(dd, ret));
+
+ clear_sbus_fast_mode(dd);
+ release_chip_resource(dd, CR_SBUS);
+ if (ret)
+ return ret;
+ }
+
+ if (fw_8051_load) {
+ do {
+ ret = load_8051_firmware(dd, &fw_8051);
+ } while (retry_firmware(dd, ret));
+ if (ret)
+ return ret;
+ }
+
+ dump_fw_version(dd);
+ return 0;
+}
+
+int hfi1_firmware_init(struct hfi1_devdata *dd)
+{
+ /* only RTL can use these */
+ if (dd->icode != ICODE_RTL_SILICON) {
+ fw_fabric_serdes_load = 0;
+ fw_pcie_serdes_load = 0;
+ fw_sbus_load = 0;
+ }
+
+ /* no 8051 or QSFP on simulator */
+ if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ fw_8051_load = 0;
+ platform_config_load = 0;
+ }
+
+ if (!fw_8051_name) {
+ if (dd->icode == ICODE_RTL_SILICON)
+ fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
+ else
+ fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
+ }
+ if (!fw_fabric_serdes_name)
+ fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
+ if (!fw_sbus_name)
+ fw_sbus_name = DEFAULT_FW_SBUS_NAME;
+ if (!fw_pcie_serdes_name)
+ fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
+ if (!platform_config_name)
+ platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
+
+ return obtain_firmware(dd);
+}
+
+/*
+ * This function is a helper function for parse_platform_config(...) and
+ * does not check for validity of the platform configuration cache
+ * (because we know it is invalid as we are building up the cache).
+ * As such, this should not be called from anywhere other than
+ * parse_platform_config
+ */
+static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table)
+{
+ u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask;
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+ if (!system_table)
+ return -EINVAL;
+
+ meta_ver_meta =
+ *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata
+ + SYSTEM_TABLE_META_VERSION);
+
+ mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+ ver_start = meta_ver_meta & mask;
+
+ meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT;
+
+ mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+ ver_len = meta_ver_meta & mask;
+
+ ver_start /= 8;
+ meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1);
+
+ if (meta_ver < 5) {
+ dd_dev_info(
+ dd, "%s:Please update platform config\n", __func__);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int parse_platform_config(struct hfi1_devdata *dd)
+{
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+ u32 *ptr = NULL;
+ u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0;
+ u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
+ int ret = -EINVAL; /* assume failure */
+
+ if (!dd->platform_config.data) {
+ dd_dev_info(dd, "%s: Missing config file\n", __func__);
+ goto bail;
+ }
+ ptr = (u32 *)dd->platform_config.data;
+
+ magic_num = *ptr;
+ ptr++;
+ if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
+ dd_dev_info(dd, "%s: Bad config file\n", __func__);
+ goto bail;
+ }
+
+ /* Field is file size in DWORDs */
+ file_length = (*ptr) * 4;
+ ptr++;
+
+ if (file_length > dd->platform_config.size) {
+ dd_dev_info(dd, "%s:File claims to be larger than read size\n",
+ __func__);
+ goto bail;
+ } else if (file_length < dd->platform_config.size) {
+ dd_dev_info(dd,
+ "%s:File claims to be smaller than read size, continuing\n",
+ __func__);
+ }
+ /* exactly equal, perfection */
+
+ /*
+ * In both cases where we proceed, using the self-reported file length
+ * is the safer option
+ */
+ while (ptr < (u32 *)(dd->platform_config.data + file_length)) {
+ header1 = *ptr;
+ header2 = *(ptr + 1);
+ if (header1 != ~header2) {
+ dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
+ __func__, (ptr - (u32 *)
+ dd->platform_config.data));
+ goto bail;
+ }
+
+ record_idx = *ptr &
+ ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
+
+ table_length_dwords = (*ptr >>
+ PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
+ ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
+
+ table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
+ ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
+
+ /* Done with this set of headers */
+ ptr += 2;
+
+ if (record_idx) {
+ /* data table */
+ switch (table_type) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ pcfgcache->config_tables[table_type].num_table =
+ 1;
+ ret = check_meta_version(dd, ptr);
+ if (ret)
+ goto bail;
+ break;
+ case PLATFORM_CONFIG_PORT_TABLE:
+ pcfgcache->config_tables[table_type].num_table =
+ 2;
+ break;
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ pcfgcache->config_tables[table_type].num_table =
+ table_length_dwords;
+ break;
+ default:
+ dd_dev_info(dd,
+ "%s: Unknown data table %d, offset %ld\n",
+ __func__, table_type,
+ (ptr - (u32 *)
+ dd->platform_config.data));
+ goto bail; /* We don't trust this file now */
+ }
+ pcfgcache->config_tables[table_type].table = ptr;
+ } else {
+ /* metadata table */
+ switch (table_type) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_PORT_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ break;
+ default:
+ dd_dev_info(dd,
+ "%s: Unknown meta table %d, offset %ld\n",
+ __func__, table_type,
+ (ptr -
+ (u32 *)dd->platform_config.data));
+ goto bail; /* We don't trust this file now */
+ }
+ pcfgcache->config_tables[table_type].table_metadata =
+ ptr;
+ }
+
+ /* Calculate and check table crc */
+ crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
+ (table_length_dwords * 4));
+ crc ^= ~(u32)0;
+
+ /* Jump the table */
+ ptr += table_length_dwords;
+ if (crc != *ptr) {
+ dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
+ __func__, (ptr -
+ (u32 *)
+ dd->platform_config.data));
+ goto bail;
+ }
+ /* Jump the CRC DWORD */
+ ptr++;
+ }
+
+ pcfgcache->cache_valid = 1;
+ return 0;
+bail:
+ memset(pcfgcache, 0, sizeof(struct platform_config_cache));
+ return ret;
+}
+
+static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
+ int field, u32 *field_len_bits,
+ u32 *field_start_bits)
+{
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+ u32 *src_ptr = NULL;
+
+ if (!pcfgcache->cache_valid)
+ return -EINVAL;
+
+ switch (table) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_PORT_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ if (field && field < platform_config_table_limits[table])
+ src_ptr =
+ pcfgcache->config_tables[table].table_metadata + field;
+ break;
+ default:
+ dd_dev_info(dd, "%s: Unknown table\n", __func__);
+ break;
+ }
+
+ if (!src_ptr)
+ return -EINVAL;
+
+ if (field_start_bits)
+ *field_start_bits = *src_ptr &
+ ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+
+ if (field_len_bits)
+ *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
+ & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+
+ return 0;
+}
+
+/* This is the central interface to getting data out of the platform config
+ * file. It depends on parse_platform_config() having populated the
+ * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
+ * validate the sanity of the cache.
+ *
+ * The non-obvious parameters:
+ * @table_index: Acts as a look up key into which instance of the tables the
+ * relevant field is fetched from.
+ *
+ * This applies to the data tables that have multiple instances. The port table
+ * is an exception to this rule as each HFI only has one port and thus the
+ * relevant table can be distinguished by hfi_id.
+ *
+ * @data: pointer to memory that will be populated with the field requested.
+ * @len: length of memory pointed by @data in bytes.
+ */
+int get_platform_config_field(struct hfi1_devdata *dd,
+ enum platform_config_table_type_encoding
+ table_type, int table_index, int field_index,
+ u32 *data, u32 len)
+{
+ int ret = 0, wlen = 0, seek = 0;
+ u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
+ struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+ if (data)
+ memset(data, 0, len);
+ else
+ return -EINVAL;
+
+ ret = get_platform_fw_field_metadata(dd, table_type, field_index,
+ &field_len_bits,
+ &field_start_bits);
+ if (ret)
+ return -EINVAL;
+
+ /* Convert length to bits */
+ len *= 8;
+
+ /* Our metadata function checked cache_valid and field_index for us */
+ switch (table_type) {
+ case PLATFORM_CONFIG_SYSTEM_TABLE:
+ src_ptr = pcfgcache->config_tables[table_type].table;
+
+ if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
+ if (len < field_len_bits)
+ return -EINVAL;
+
+ seek = field_start_bits / 8;
+ wlen = field_len_bits / 8;
+
+ src_ptr = (u32 *)((u8 *)src_ptr + seek);
+
+ /*
+ * We expect the field to be byte aligned and whole byte
+ * lengths if we are here
+ */
+ memcpy(data, src_ptr, wlen);
+ return 0;
+ }
+ break;
+ case PLATFORM_CONFIG_PORT_TABLE:
+ /* Port table is 4 DWORDS */
+ src_ptr = dd->hfi1_id ?
+ pcfgcache->config_tables[table_type].table + 4 :
+ pcfgcache->config_tables[table_type].table;
+ break;
+ case PLATFORM_CONFIG_RX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_TX_PRESET_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+ /* fall through */
+ case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+ src_ptr = pcfgcache->config_tables[table_type].table;
+
+ if (table_index <
+ pcfgcache->config_tables[table_type].num_table)
+ src_ptr += table_index;
+ else
+ src_ptr = NULL;
+ break;
+ default:
+ dd_dev_info(dd, "%s: Unknown table\n", __func__);
+ break;
+ }
+
+ if (!src_ptr || len < field_len_bits)
+ return -EINVAL;
+
+ src_ptr += (field_start_bits / 32);
+ *data = (*src_ptr >> (field_start_bits % 32)) &
+ ((1 << field_len_bits) - 1);
+
+ return 0;
+}
+
+/*
+ * Download the firmware needed for the Gen3 PCIe SerDes. An update
+ * to the SBus firmware is needed before updating the PCIe firmware.
+ *
+ * Note: caller must be holding the SBus resource.
+ */
+int load_pcie_firmware(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+
+ /* both firmware loads below use the SBus */
+ set_sbus_fast_mode(dd);
+
+ if (fw_sbus_load) {
+ turn_off_spicos(dd, SPICO_SBUS);
+ do {
+ ret = load_sbus_firmware(dd, &fw_sbus);
+ } while (retry_firmware(dd, ret));
+ if (ret)
+ goto done;
+ }
+
+ if (fw_pcie_serdes_load) {
+ dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
+ set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
+ pcie_serdes_broadcast[dd->hfi1_id],
+ pcie_serdes_addrs[dd->hfi1_id],
+ NUM_PCIE_SERDES);
+ do {
+ ret = load_pcie_serdes_firmware(dd, &fw_pcie);
+ } while (retry_firmware(dd, ret));
+ if (ret)
+ goto done;
+ }
+
+done:
+ clear_sbus_fast_mode(dd);
+
+ return ret;
+}
+
+/*
+ * Read the GUID from the hardware, store it in dd.
+ */
+void read_guid(struct hfi1_devdata *dd)
+{
+ /* Take the DC out of reset to get a valid GUID value */
+ write_csr(dd, CCE_DC_CTRL, 0);
+ (void)read_csr(dd, CCE_DC_CTRL);
+
+ dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
+ dd_dev_info(dd, "GUID %llx",
+ (unsigned long long)dd->base_guid);
+}
+
+/* read and display firmware version info */
+static void dump_fw_version(struct hfi1_devdata *dd)
+{
+ u32 pcie_vers[NUM_PCIE_SERDES];
+ u32 fabric_vers[NUM_FABRIC_SERDES];
+ u32 sbus_vers;
+ int i;
+ int all_same;
+ int ret;
+ u8 rcv_addr;
+
+ ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+ if (ret) {
+ dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n");
+ return;
+ }
+
+ /* set fast mode */
+ set_sbus_fast_mode(dd);
+
+ /* read version for SBus Master */
+ sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0);
+ sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1);
+ /* wait for interrupt to be processed */
+ usleep_range(10000, 11000);
+ sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1);
+ dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers);
+
+ /* read version for PCIe SerDes */
+ all_same = 1;
+ pcie_vers[0] = 0;
+ for (i = 0; i < NUM_PCIE_SERDES; i++) {
+ rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i];
+ sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0);
+ /* wait for interrupt to be processed */
+ usleep_range(10000, 11000);
+ pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0);
+ if (i > 0 && pcie_vers[0] != pcie_vers[i])
+ all_same = 0;
+ }
+
+ if (all_same) {
+ dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n",
+ pcie_vers[0]);
+ } else {
+ dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n");
+ for (i = 0; i < NUM_PCIE_SERDES; i++) {
+ dd_dev_info(dd,
+ "PCIe SerDes lane %d firmware version 0x%x\n",
+ i, pcie_vers[i]);
+ }
+ }
+
+ /* read version for fabric SerDes */
+ all_same = 1;
+ fabric_vers[0] = 0;
+ for (i = 0; i < NUM_FABRIC_SERDES; i++) {
+ rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i];
+ sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0);
+ /* wait for interrupt to be processed */
+ usleep_range(10000, 11000);
+ fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0);
+ if (i > 0 && fabric_vers[0] != fabric_vers[i])
+ all_same = 0;
+ }
+
+ if (all_same) {
+ dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n",
+ fabric_vers[0]);
+ } else {
+ dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n");
+ for (i = 0; i < NUM_FABRIC_SERDES; i++) {
+ dd_dev_info(dd,
+ "Fabric SerDes lane %d firmware version 0x%x\n",
+ i, fabric_vers[i]);
+ }
+ }
+
+ clear_sbus_fast_mode(dd);
+ release_chip_resource(dd, CR_SBUS);
+}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
new file mode 100644
index 000000000000..325ec211370f
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -0,0 +1,2057 @@
+#ifndef _HFI1_KERNEL_H
+#define _HFI1_KERNEL_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/i2c.h>
+#include <linux/i2c-algo-bit.h>
+#include <rdma/rdma_vt.h>
+
+#include "chip_registers.h"
+#include "common.h"
+#include "verbs.h"
+#include "pio.h"
+#include "chip.h"
+#include "mad.h"
+#include "qsfp.h"
+#include "platform.h"
+#include "affinity.h"
+
+/* bumped 1 from s/w major version of TrueScale */
+#define HFI1_CHIP_VERS_MAJ 3U
+
+/* don't care about this except printing */
+#define HFI1_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define HFI1_OUI 0x001175
+#define HFI1_OUI_LSB 40
+
+#define DROP_PACKET_OFF 0
+#define DROP_PACKET_ON 1
+
+extern unsigned long hfi1_cap_mask;
+#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
+#define HFI1_CAP_UGET_MASK(mask, cap) \
+ (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
+#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
+#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
+#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
+ HFI1_CAP_MISC_MASK)
+/* Offline Disabled Reason is 4-bits */
+#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON)
+
+/*
+ * Control context is always 0 and handles the error packets.
+ * It also handles the VL15 and multicast packets.
+ */
+#define HFI1_CTRL_CTXT 0
+
+/*
+ * Driver context will store software counters for each of the events
+ * associated with these status registers
+ */
+#define NUM_CCE_ERR_STATUS_COUNTERS 41
+#define NUM_RCV_ERR_STATUS_COUNTERS 64
+#define NUM_MISC_ERR_STATUS_COUNTERS 13
+#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36
+#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4
+#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64
+#define NUM_SEND_ERR_STATUS_COUNTERS 3
+#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5
+#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24
+
+/*
+ * per driver stats, either not device nor port-specific, or
+ * summed over all of the devices and ports.
+ * They are described by name via ipathfs filesystem, so layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted hfi1_statnames[] in debugfs.c must
+ * change to match.
+ */
+struct hfi1_ib_stats {
+ __u64 sps_ints; /* number of interrupts handled */
+ __u64 sps_errints; /* number of error interrupts */
+ __u64 sps_txerrs; /* tx-related packet errors */
+ __u64 sps_rcverrs; /* non-crc rcv packet errors */
+ __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+ __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+ __u64 sps_ctxts; /* number of contexts currently open */
+ __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+ __u64 sps_buffull;
+ __u64 sps_hdrfull;
+};
+
+extern struct hfi1_ib_stats hfi1_stats;
+extern const struct pci_error_handlers hfi1_pci_err_handler;
+
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct hfi1_opcode_stats_perctx;
+#endif
+
+struct ctxt_eager_bufs {
+ ssize_t size; /* total size of eager buffers */
+ u32 count; /* size of buffers array */
+ u32 numbufs; /* number of buffers allocated */
+ u32 alloced; /* number of rcvarray entries used */
+ u32 rcvtid_size; /* size of each eager rcv tid */
+ u32 threshold; /* head update threshold */
+ struct eager_buffer {
+ void *addr;
+ dma_addr_t phys;
+ ssize_t len;
+ } *buffers;
+ struct {
+ void *addr;
+ dma_addr_t phys;
+ } *rcvtids;
+};
+
+struct exp_tid_set {
+ struct list_head list;
+ u32 count;
+};
+
+struct hfi1_ctxtdata {
+ /* shadow the ctxt's RcvCtrl register */
+ u64 rcvctrl;
+ /* rcvhdrq base, needs mmap before useful */
+ void *rcvhdrq;
+ /* kernel virtual address where hdrqtail is updated */
+ volatile __le64 *rcvhdrtail_kvaddr;
+ /*
+ * Shared page for kernel to signal user processes that send buffers
+ * need disarming. The process should call HFI1_CMD_DISARM_BUFS
+ * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+ */
+ unsigned long *user_event_mask;
+ /* when waiting for rcv or pioavail */
+ wait_queue_head_t wait;
+ /* rcvhdrq size (for freeing) */
+ size_t rcvhdrq_size;
+ /* number of rcvhdrq entries */
+ u16 rcvhdrq_cnt;
+ /* size of each of the rcvhdrq entries */
+ u16 rcvhdrqentsize;
+ /* mmap of hdrq, must fit in 44 bits */
+ dma_addr_t rcvhdrq_phys;
+ dma_addr_t rcvhdrqtailaddr_phys;
+ struct ctxt_eager_bufs egrbufs;
+ /* this receive context's assigned PIO ACK send context */
+ struct send_context *sc;
+
+ /* dynamic receive available interrupt timeout */
+ u32 rcvavail_timeout;
+ /*
+ * number of opens (including slave sub-contexts) on this instance
+ * (ignoring forks, dup, etc. for now)
+ */
+ int cnt;
+ /*
+ * how much space to leave at start of eager TID entries for
+ * protocol use, on each TID
+ */
+ /* instead of calculating it */
+ unsigned ctxt;
+ /* non-zero if ctxt is being shared. */
+ u16 subctxt_cnt;
+ /* non-zero if ctxt is being shared. */
+ u16 subctxt_id;
+ u8 uuid[16];
+ /* job key */
+ u16 jkey;
+ /* number of RcvArray groups for this context. */
+ u32 rcv_array_groups;
+ /* index of first eager TID entry. */
+ u32 eager_base;
+ /* number of expected TID entries */
+ u32 expected_count;
+ /* index of first expected TID entry. */
+ u32 expected_base;
+
+ struct exp_tid_set tid_group_list;
+ struct exp_tid_set tid_used_list;
+ struct exp_tid_set tid_full_list;
+
+ /* lock protecting all Expected TID data */
+ struct mutex exp_lock;
+ /* number of pio bufs for this ctxt (all procs, if shared) */
+ u32 piocnt;
+ /* first pio buffer for this ctxt */
+ u32 pio_base;
+ /* chip offset of PIO buffers for this ctxt */
+ u32 piobufs;
+ /* per-context configuration flags */
+ unsigned long flags;
+ /* per-context event flags for fileops/intr communication */
+ unsigned long event_flags;
+ /* WAIT_RCV that timed out, no interrupt */
+ u32 rcvwait_to;
+ /* WAIT_PIO that timed out, no interrupt */
+ u32 piowait_to;
+ /* WAIT_RCV already happened, no wait */
+ u32 rcvnowait;
+ /* WAIT_PIO already happened, no wait */
+ u32 pionowait;
+ /* total number of polled urgent packets */
+ u32 urgent;
+ /* saved total number of polled urgent packets for poll edge trigger */
+ u32 urgent_poll;
+ /* same size as task_struct .comm[], command that opened context */
+ char comm[TASK_COMM_LEN];
+ /* so file ops can get at unit */
+ struct hfi1_devdata *dd;
+ /* so functions that need physical port can get it easily */
+ struct hfi1_pportdata *ppd;
+ /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+ void *subctxt_uregbase;
+ /* An array of pages for the eager receive buffers * N */
+ void *subctxt_rcvegrbuf;
+ /* An array of pages for the eager header queue entries * N */
+ void *subctxt_rcvhdr_base;
+ /* The version of the library which opened this ctxt */
+ u32 userversion;
+ /* Bitmask of active slaves */
+ u32 active_slaves;
+ /* Type of packets or conditions we want to poll for */
+ u16 poll_type;
+ /* receive packet sequence counter */
+ u8 seq_cnt;
+ u8 redirect_seq_cnt;
+ /* ctxt rcvhdrq head offset */
+ u32 head;
+ u32 pkt_count;
+ /* QPs waiting for context processing */
+ struct list_head qp_wait_list;
+ /* interrupt handling */
+ u64 imask; /* clear interrupt mask */
+ int ireg; /* clear interrupt register */
+ unsigned numa_id; /* numa node of this context */
+ /* verbs stats per CTX */
+ struct hfi1_opcode_stats_perctx *opstats;
+ /*
+ * This is the kernel thread that will keep making
+ * progress on the user sdma requests behind the scenes.
+ * There is one per context (shared contexts use the master's).
+ */
+ struct task_struct *progress;
+ struct list_head sdma_queues;
+ /* protect sdma queues */
+ spinlock_t sdma_qlock;
+
+ /* Is ASPM interrupt supported for this context */
+ bool aspm_intr_supported;
+ /* ASPM state (enabled/disabled) for this context */
+ bool aspm_enabled;
+ /* Timer for re-enabling ASPM if interrupt activity quietens down */
+ struct timer_list aspm_timer;
+ /* Lock to serialize between intr, timer intr and user threads */
+ spinlock_t aspm_lock;
+ /* Is ASPM processing enabled for this context (in intr context) */
+ bool aspm_intr_enable;
+ /* Last interrupt timestamp */
+ ktime_t aspm_ts_last_intr;
+ /* Last timestamp at which we scheduled a timer for this context */
+ ktime_t aspm_ts_timer_sched;
+
+ /*
+ * The interrupt handler for a particular receive context can vary
+ * throughout it's lifetime. This is not a lock protected data member so
+ * it must be updated atomically and the prev and new value must always
+ * be valid. Worst case is we process an extra interrupt and up to 64
+ * packets with the wrong interrupt handler.
+ */
+ int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
+};
+
+/*
+ * Represents a single packet at a high level. Put commonly computed things in
+ * here so we do not have to keep doing them over and over. The rule of thumb is
+ * if something is used one time to derive some value, store that something in
+ * here. If it is used multiple times, then store the result of that derivation
+ * in here.
+ */
+struct hfi1_packet {
+ void *ebuf;
+ void *hdr;
+ struct hfi1_ctxtdata *rcd;
+ __le32 *rhf_addr;
+ struct rvt_qp *qp;
+ struct hfi1_other_headers *ohdr;
+ u64 rhf;
+ u32 maxcnt;
+ u32 rhqoff;
+ u32 hdrqtail;
+ int numpkt;
+ u16 tlen;
+ u16 hlen;
+ s16 etail;
+ u16 rsize;
+ u8 updegr;
+ u8 rcv_flags;
+ u8 etype;
+};
+
+/*
+ * Private data for snoop/capture support.
+ */
+struct hfi1_snoop_data {
+ int mode_flag;
+ struct cdev cdev;
+ struct device *class_dev;
+ /* protect snoop data */
+ spinlock_t snoop_lock;
+ struct list_head queue;
+ wait_queue_head_t waitq;
+ void *filter_value;
+ int (*filter_callback)(void *hdr, void *data, void *value);
+ u64 dcc_cfg; /* saved value of DCC Cfg register */
+};
+
+/* snoop mode_flag values */
+#define HFI1_PORT_SNOOP_MODE 1U
+#define HFI1_PORT_CAPTURE_MODE 2U
+
+struct rvt_sge_state;
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
+#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
+#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define HFI1_IB_CFG_SPD 5 /* current Link spd */
+#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
+#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
+#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
+#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
+#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * HFI or Host Link States
+ *
+ * These describe the states the driver thinks the logical and physical
+ * states are in. Used as an argument to set_link_state(). Implemented
+ * as bits for easy multi-state checking. The actual state can only be
+ * one.
+ */
+#define __HLS_UP_INIT_BP 0
+#define __HLS_UP_ARMED_BP 1
+#define __HLS_UP_ACTIVE_BP 2
+#define __HLS_DN_DOWNDEF_BP 3 /* link down default */
+#define __HLS_DN_POLL_BP 4
+#define __HLS_DN_DISABLE_BP 5
+#define __HLS_DN_OFFLINE_BP 6
+#define __HLS_VERIFY_CAP_BP 7
+#define __HLS_GOING_UP_BP 8
+#define __HLS_GOING_OFFLINE_BP 9
+#define __HLS_LINK_COOLDOWN_BP 10
+
+#define HLS_UP_INIT BIT(__HLS_UP_INIT_BP)
+#define HLS_UP_ARMED BIT(__HLS_UP_ARMED_BP)
+#define HLS_UP_ACTIVE BIT(__HLS_UP_ACTIVE_BP)
+#define HLS_DN_DOWNDEF BIT(__HLS_DN_DOWNDEF_BP) /* link down default */
+#define HLS_DN_POLL BIT(__HLS_DN_POLL_BP)
+#define HLS_DN_DISABLE BIT(__HLS_DN_DISABLE_BP)
+#define HLS_DN_OFFLINE BIT(__HLS_DN_OFFLINE_BP)
+#define HLS_VERIFY_CAP BIT(__HLS_VERIFY_CAP_BP)
+#define HLS_GOING_UP BIT(__HLS_GOING_UP_BP)
+#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP)
+#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP)
+
+#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
+#define HLS_DOWN ~(HLS_UP)
+
+/* use this MTU size if none other is given */
+#define HFI1_DEFAULT_ACTIVE_MTU 10240
+/* use this MTU size as the default maximum */
+#define HFI1_DEFAULT_MAX_MTU 10240
+/* default partition key */
+#define DEFAULT_PKEY 0xffff
+
+/*
+ * Possible fabric manager config parameters for fm_{get,set}_table()
+ */
+#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */
+#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */
+#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */
+#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */
+#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */
+#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
+ */
+#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
+#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
+#define HFI1_RCVCTRL_CTXT_ENB 0x04
+#define HFI1_RCVCTRL_CTXT_DIS 0x08
+#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
+#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
+#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */
+#define HFI1_RCVCTRL_PKEY_DIS 0x80
+#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
+#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
+#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
+#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
+#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
+#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+
+/* partition enforcement flags */
+#define HFI1_PART_ENFORCE_IN 0x1
+#define HFI1_PART_ENFORCE_OUT 0x2
+
+/* how often we check for synthetic counter wrap around */
+#define SYNTH_CNT_TIME 2
+
+/* Counter flags */
+#define CNTR_NORMAL 0x0 /* Normal counters, just read register */
+#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */
+#define CNTR_DISABLED 0x2 /* Disable this counter */
+#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */
+#define CNTR_VL 0x8 /* Per VL counter */
+#define CNTR_SDMA 0x10
+#define CNTR_INVALID_VL -1 /* Specifies invalid VL */
+#define CNTR_MODE_W 0x0
+#define CNTR_MODE_R 0x1
+
+/* VLs Supported/Operational */
+#define HFI1_MIN_VLS_SUPPORTED 1
+#define HFI1_MAX_VLS_SUPPORTED 8
+
+static inline void incr_cntr64(u64 *cntr)
+{
+ if (*cntr < (u64)-1LL)
+ (*cntr)++;
+}
+
+static inline void incr_cntr32(u32 *cntr)
+{
+ if (*cntr < (u32)-1LL)
+ (*cntr)++;
+}
+
+#define MAX_NAME_SIZE 64
+struct hfi1_msix_entry {
+ enum irq_type type;
+ struct msix_entry msix;
+ void *arg;
+ char name[MAX_NAME_SIZE];
+ cpumask_t mask;
+};
+
+/* per-SL CCA information */
+struct cca_timer {
+ struct hrtimer hrtimer;
+ struct hfi1_pportdata *ppd; /* read-only */
+ int sl; /* read-only */
+ u16 ccti; /* read/write - current value of CCTI */
+};
+
+struct link_down_reason {
+ /*
+ * SMA-facing value. Should be set from .latest when
+ * HLS_UP_* -> HLS_DN_* transition actually occurs.
+ */
+ u8 sma;
+ u8 latest;
+};
+
+enum {
+ LO_PRIO_TABLE,
+ HI_PRIO_TABLE,
+ MAX_PRIO_TABLE
+};
+
+struct vl_arb_cache {
+ /* protect vl arb cache */
+ spinlock_t lock;
+ struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct hfi1_pportdata {
+ struct hfi1_ibport ibport_data;
+
+ struct hfi1_devdata *dd;
+ struct kobject pport_cc_kobj;
+ struct kobject sc2vl_kobj;
+ struct kobject sl2sc_kobj;
+ struct kobject vl2mtu_kobj;
+
+ /* PHY support */
+ u32 port_type;
+ struct qsfp_data qsfp_info;
+
+ /* GUID for this interface, in host order */
+ u64 guid;
+ /* GUID for peer interface, in host order */
+ u64 neighbor_guid;
+
+ /* up or down physical link state */
+ u32 linkup;
+
+ /*
+ * this address is mapped read-only into user processes so they can
+ * get status cheaply, whenever they want. One qword of status per port
+ */
+ u64 *statusp;
+
+ /* SendDMA related entries */
+
+ struct workqueue_struct *hfi1_wq;
+
+ /* move out of interrupt context */
+ struct work_struct link_vc_work;
+ struct work_struct link_up_work;
+ struct work_struct link_down_work;
+ struct work_struct sma_message_work;
+ struct work_struct freeze_work;
+ struct work_struct link_downgrade_work;
+ struct work_struct link_bounce_work;
+ struct delayed_work start_link_work;
+ /* host link state variables */
+ struct mutex hls_lock;
+ u32 host_link_state;
+
+ spinlock_t sdma_alllock ____cacheline_aligned_in_smp;
+
+ u32 lstate; /* logical link state */
+
+ /* these are the "32 bit" regs */
+
+ u32 ibmtu; /* The MTU programmed for this unit */
+ /*
+ * Current max size IB packet (in bytes) including IB headers, that
+ * we can send. Changes when ibmtu changes.
+ */
+ u32 ibmaxlen;
+ u32 current_egress_rate; /* units [10^6 bits/sec] */
+ /* LID programmed for this instance */
+ u16 lid;
+ /* list of pkeys programmed; 0 if not set */
+ u16 pkeys[MAX_PKEY_VALUES];
+ u16 link_width_supported;
+ u16 link_width_downgrade_supported;
+ u16 link_speed_supported;
+ u16 link_width_enabled;
+ u16 link_width_downgrade_enabled;
+ u16 link_speed_enabled;
+ u16 link_width_active;
+ u16 link_width_downgrade_tx_active;
+ u16 link_width_downgrade_rx_active;
+ u16 link_speed_active;
+ u8 vls_supported;
+ u8 vls_operational;
+ u8 actual_vls_operational;
+ /* LID mask control */
+ u8 lmc;
+ /* Rx Polarity inversion (compensate for ~tx on partner) */
+ u8 rx_pol_inv;
+
+ u8 hw_pidx; /* physical port index */
+ u8 port; /* IB port number and index into dd->pports - 1 */
+ /* type of neighbor node */
+ u8 neighbor_type;
+ u8 neighbor_normal;
+ u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
+ u8 neighbor_port_number;
+ u8 is_sm_config_started;
+ u8 offline_disabled_reason;
+ u8 is_active_optimize_enabled;
+ u8 driver_link_ready; /* driver ready for active link */
+ u8 link_enabled; /* link enabled? */
+ u8 linkinit_reason;
+ u8 local_tx_rate; /* rate given to 8051 firmware */
+ u8 last_pstate; /* info only */
+ u8 qsfp_retry_count;
+
+ /* placeholders for IB MAD packet settings */
+ u8 overrun_threshold;
+ u8 phy_error_threshold;
+
+ /* Used to override LED behavior for things like maintenance beaconing*/
+ /*
+ * Alternates per phase of blink
+ * [0] holds LED off duration, [1] holds LED on duration
+ */
+ unsigned long led_override_vals[2];
+ u8 led_override_phase; /* LSB picks from vals[] */
+ atomic_t led_override_timer_active;
+ /* Used to flash LEDs in override mode */
+ struct timer_list led_override_timer;
+
+ u32 sm_trap_qp;
+ u32 sa_qp;
+
+ /*
+ * cca_timer_lock protects access to the per-SL cca_timer
+ * structures (specifically the ccti member).
+ */
+ spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
+ struct cca_timer cca_timer[OPA_MAX_SLS];
+
+ /* List of congestion control table entries */
+ struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
+
+ /* congestion entries, each entry corresponding to a SL */
+ struct opa_congestion_setting_entry_shadow
+ congestion_entries[OPA_MAX_SLS];
+
+ /*
+ * cc_state_lock protects (write) access to the per-port
+ * struct cc_state.
+ */
+ spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
+
+ struct cc_state __rcu *cc_state;
+
+ /* Total number of congestion control table entries */
+ u16 total_cct_entry;
+
+ /* Bit map identifying service level */
+ u32 cc_sl_control_map;
+
+ /* CA's max number of 64 entry units in the congestion control table */
+ u8 cc_max_table_entries;
+
+ /*
+ * begin congestion log related entries
+ * cc_log_lock protects all congestion log related data
+ */
+ spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
+ u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
+ u16 threshold_event_counter;
+ struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
+ int cc_log_idx; /* index for logging events */
+ int cc_mad_idx; /* index for reporting events */
+ /* end congestion log related entries */
+
+ struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
+
+ /* port relative counter buffer */
+ u64 *cntrs;
+ /* port relative synthetic counter buffer */
+ u64 *scntrs;
+ /* port_xmit_discards are synthesized from different egress errors */
+ u64 port_xmit_discards;
+ u64 port_xmit_discards_vl[C_VL_COUNT];
+ u64 port_xmit_constraint_errors;
+ u64 port_rcv_constraint_errors;
+ /* count of 'link_err' interrupts from DC */
+ u64 link_downed;
+ /* number of times link retrained successfully */
+ u64 link_up;
+ /* number of times a link unknown frame was reported */
+ u64 unknown_frame_count;
+ /* port_ltp_crc_mode is returned in 'portinfo' MADs */
+ u16 port_ltp_crc_mode;
+ /* port_crc_mode_enabled is the crc we support */
+ u8 port_crc_mode_enabled;
+ /* mgmt_allowed is also returned in 'portinfo' MADs */
+ u8 mgmt_allowed;
+ u8 part_enforce; /* partition enforcement flags */
+ struct link_down_reason local_link_down_reason;
+ struct link_down_reason neigh_link_down_reason;
+ /* Value to be sent to link peer on LinkDown .*/
+ u8 remote_link_down_reason;
+ /* Error events that will cause a port bounce. */
+ u32 port_error_action;
+ struct work_struct linkstate_active_work;
+ /* Does this port need to prescan for FECNs */
+ bool cc_prescan;
+};
+
+typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+typedef void (*opcode_handler)(struct hfi1_packet *packet);
+
+/* return values for the RHF receive functions */
+#define RHF_RCV_CONTINUE 0 /* keep going */
+#define RHF_RCV_DONE 1 /* stop, this packet processed */
+#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */
+
+struct rcv_array_data {
+ u8 group_size;
+ u16 ngroups;
+ u16 nctxt_extra;
+};
+
+struct per_vl_data {
+ u16 mtu;
+ struct send_context *sc;
+};
+
+/* 16 to directly index */
+#define PER_VL_SEND_CONTEXTS 16
+
+struct err_info_rcvport {
+ u8 status_and_code;
+ u64 packet_flit1;
+ u64 packet_flit2;
+};
+
+struct err_info_constraint {
+ u8 status;
+ u16 pkey;
+ u32 slid;
+};
+
+struct hfi1_temp {
+ unsigned int curr; /* current temperature */
+ unsigned int lo_lim; /* low temperature limit */
+ unsigned int hi_lim; /* high temperature limit */
+ unsigned int crit_lim; /* critical temperature limit */
+ u8 triggers; /* temperature triggers */
+};
+
+struct hfi1_i2c_bus {
+ struct hfi1_devdata *controlling_dd; /* current controlling device */
+ struct i2c_adapter adapter; /* bus details */
+ struct i2c_algo_bit_data algo; /* bus algorithm details */
+ int num; /* bus number, 0 or 1 */
+};
+
+/* common data between shared ASIC HFIs */
+struct hfi1_asic_data {
+ struct hfi1_devdata *dds[2]; /* back pointers */
+ struct mutex asic_resource_mutex;
+ struct hfi1_i2c_bus *i2c_bus0;
+ struct hfi1_i2c_bus *i2c_bus1;
+};
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a hfi1_pportdata struct.
+ */
+struct sdma_engine;
+struct sdma_vl_map;
+
+#define BOARD_VERS_MAX 96 /* how long the version string can be */
+#define SERIAL_MAX 16 /* length of the serial number */
+
+typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
+struct hfi1_devdata {
+ struct hfi1_ibdev verbs_dev; /* must be first */
+ struct list_head list;
+ /* pointers to related structs for this device */
+ /* pci access data structure */
+ struct pci_dev *pcidev;
+ struct cdev user_cdev;
+ struct cdev diag_cdev;
+ struct cdev ui_cdev;
+ struct device *user_device;
+ struct device *diag_device;
+ struct device *ui_device;
+
+ /* mem-mapped pointer to base of chip regs */
+ u8 __iomem *kregbase;
+ /* end of mem-mapped chip space excluding sendbuf and user regs */
+ u8 __iomem *kregend;
+ /* physical address of chip for io_remap, etc. */
+ resource_size_t physaddr;
+ /* receive context data */
+ struct hfi1_ctxtdata **rcd;
+ /* send context data */
+ struct send_context_info *send_contexts;
+ /* map hardware send contexts to software index */
+ u8 *hw_to_sw;
+ /* spinlock for allocating and releasing send context resources */
+ spinlock_t sc_lock;
+ /* Per VL data. Enough for all VLs but not all elements are set/used. */
+ struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
+ /* lock for pio_map */
+ spinlock_t pio_map_lock;
+ /* array of kernel send contexts */
+ struct send_context **kernel_send_context;
+ /* array of vl maps */
+ struct pio_vl_map __rcu *pio_map;
+ /* seqlock for sc2vl */
+ seqlock_t sc2vl_lock;
+ u64 sc2vl[4];
+ /* Send Context initialization lock. */
+ spinlock_t sc_init_lock;
+
+ /* fields common to all SDMA engines */
+
+ /* default flags to last descriptor */
+ u64 default_desc1;
+ volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */
+ dma_addr_t sdma_heads_phys;
+ void *sdma_pad_dma; /* DMA'ed by chip */
+ dma_addr_t sdma_pad_phys;
+ /* for deallocation */
+ size_t sdma_heads_size;
+ /* number from the chip */
+ u32 chip_sdma_engines;
+ /* num used */
+ u32 num_sdma;
+ /* lock for sdma_map */
+ spinlock_t sde_map_lock;
+ /* array of engines sized by num_sdma */
+ struct sdma_engine *per_sdma;
+ /* array of vl maps */
+ struct sdma_vl_map __rcu *sdma_map;
+ /* SPC freeze waitqueue and variable */
+ wait_queue_head_t sdma_unfreeze_wq;
+ atomic_t sdma_unfreeze_count;
+
+ /* common data between shared ASIC HFIs in this OS */
+ struct hfi1_asic_data *asic_data;
+
+ /* hfi1_pportdata, points to array of (physical) port-specific
+ * data structs, indexed by pidx (0..n-1)
+ */
+ struct hfi1_pportdata *pport;
+
+ /* mem-mapped pointer to base of PIO buffers */
+ void __iomem *piobase;
+ /*
+ * write-combining mem-mapped pointer to base of RcvArray
+ * memory.
+ */
+ void __iomem *rcvarray_wc;
+ /*
+ * credit return base - a per-NUMA range of DMA address that
+ * the chip will use to update the per-context free counter
+ */
+ struct credit_return_base *cr_base;
+
+ /* send context numbers and sizes for each type */
+ struct sc_config_sizes sc_sizes[SC_MAX];
+
+ u32 lcb_access_count; /* count of LCB users */
+
+ char *boardname; /* human readable board info */
+
+ /* device (not port) flags, basically device capabilities */
+ u32 flags;
+
+ /* reset value */
+ u64 z_int_counter;
+ u64 z_rcv_limit;
+ u64 z_send_schedule;
+ /* percpu int_counter */
+ u64 __percpu *int_counter;
+ u64 __percpu *rcv_limit;
+ u64 __percpu *send_schedule;
+ /* number of receive contexts in use by the driver */
+ u32 num_rcv_contexts;
+ /* number of pio send contexts in use by the driver */
+ u32 num_send_contexts;
+ /*
+ * number of ctxts available for PSM open
+ */
+ u32 freectxts;
+ /* total number of available user/PSM contexts */
+ u32 num_user_contexts;
+ /* base receive interrupt timeout, in CSR units */
+ u32 rcv_intr_timeout_csr;
+
+ u64 __iomem *egrtidbase;
+ spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
+ spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
+ /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+ spinlock_t uctxt_lock; /* rcd and user context changes */
+ /* exclusive access to 8051 */
+ spinlock_t dc8051_lock;
+ /* exclusive access to 8051 memory */
+ spinlock_t dc8051_memlock;
+ int dc8051_timed_out; /* remember if the 8051 timed out */
+ /*
+ * A page that will hold event notification bitmaps for all
+ * contexts. This page will be mapped into all processes.
+ */
+ unsigned long *events;
+ /*
+ * per unit status, see also portdata statusp
+ * mapped read-only into user processes so they can get unit and
+ * IB link status cheaply
+ */
+ struct hfi1_status *status;
+ u32 freezelen; /* max length of freezemsg */
+
+ /* revision register shadow */
+ u64 revision;
+ /* Base GUID for device (network order) */
+ u64 base_guid;
+
+ /* these are the "32 bit" regs */
+
+ /* value we put in kr_rcvhdrsize */
+ u32 rcvhdrsize;
+ /* number of receive contexts the chip supports */
+ u32 chip_rcv_contexts;
+ /* number of receive array entries */
+ u32 chip_rcv_array_count;
+ /* number of PIO send contexts the chip supports */
+ u32 chip_send_contexts;
+ /* number of bytes in the PIO memory buffer */
+ u32 chip_pio_mem_size;
+ /* number of bytes in the SDMA memory buffer */
+ u32 chip_sdma_mem_size;
+
+ /* size of each rcvegrbuffer */
+ u32 rcvegrbufsize;
+ /* log2 of above */
+ u16 rcvegrbufsize_shift;
+ /* both sides of the PCIe link are gen3 capable */
+ u8 link_gen3_capable;
+ /* localbus width (1, 2,4,8,16,32) from config space */
+ u32 lbus_width;
+ /* localbus speed in MHz */
+ u32 lbus_speed;
+ int unit; /* unit # of this chip */
+ int node; /* home node of this chip */
+
+ /* save these PCI fields to restore after a reset */
+ u32 pcibar0;
+ u32 pcibar1;
+ u32 pci_rom;
+ u16 pci_command;
+ u16 pcie_devctl;
+ u16 pcie_lnkctl;
+ u16 pcie_devctl2;
+ u32 pci_msix0;
+ u32 pci_lnkctl3;
+ u32 pci_tph2;
+
+ /*
+ * ASCII serial number, from flash, large enough for original
+ * all digit strings, and longer serial number format
+ */
+ u8 serial[SERIAL_MAX];
+ /* human readable board version */
+ u8 boardversion[BOARD_VERS_MAX];
+ u8 lbus_info[32]; /* human readable localbus info */
+ /* chip major rev, from CceRevision */
+ u8 majrev;
+ /* chip minor rev, from CceRevision */
+ u8 minrev;
+ /* hardware ID */
+ u8 hfi1_id;
+ /* implementation code */
+ u8 icode;
+ /* default link down value (poll/sleep) */
+ u8 link_default;
+ /* vAU of this device */
+ u8 vau;
+ /* vCU of this device */
+ u8 vcu;
+ /* link credits of this device */
+ u16 link_credits;
+ /* initial vl15 credits to use */
+ u16 vl15_init;
+
+ /* Misc small ints */
+ /* Number of physical ports available */
+ u8 num_pports;
+ /* Lowest context number which can be used by user processes */
+ u8 first_user_ctxt;
+ u8 n_krcv_queues;
+ u8 qos_shift;
+ u8 qpn_mask;
+
+ u16 rhf_offset; /* offset of RHF within receive header entry */
+ u16 irev; /* implementation revision */
+ u16 dc8051_ver; /* 8051 firmware version */
+
+ struct platform_config platform_config;
+ struct platform_config_cache pcfg_cache;
+
+ struct diag_client *diag_client;
+ spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
+
+ u8 psxmitwait_supported;
+ /* cycle length of PS* counters in HW (in picoseconds) */
+ u16 psxmitwait_check_rate;
+ /* high volume overflow errors deferred to tasklet */
+ struct tasklet_struct error_tasklet;
+
+ /* MSI-X information */
+ struct hfi1_msix_entry *msix_entries;
+ u32 num_msix_entries;
+
+ /* INTx information */
+ u32 requested_intx_irq; /* did we request one? */
+ char intx_name[MAX_NAME_SIZE]; /* INTx name */
+
+ /* general interrupt: mask of handled interrupts */
+ u64 gi_mask[CCE_NUM_INT_CSRS];
+
+ struct rcv_array_data rcv_entries;
+
+ /*
+ * 64 bit synthetic counters
+ */
+ struct timer_list synth_stats_timer;
+
+ /*
+ * device counters
+ */
+ char *cntrnames;
+ size_t cntrnameslen;
+ size_t ndevcntrs;
+ u64 *cntrs;
+ u64 *scntrs;
+
+ /*
+ * remembered values for synthetic counters
+ */
+ u64 last_tx;
+ u64 last_rx;
+
+ /*
+ * per-port counters
+ */
+ size_t nportcntrs;
+ char *portcntrnames;
+ size_t portcntrnameslen;
+
+ struct hfi1_snoop_data hfi1_snoop;
+
+ struct err_info_rcvport err_info_rcvport;
+ struct err_info_constraint err_info_rcv_constraint;
+ struct err_info_constraint err_info_xmit_constraint;
+ u8 err_info_uncorrectable;
+ u8 err_info_fmconfig;
+
+ atomic_t drop_packet;
+ u8 do_drop;
+
+ /*
+ * Software counters for the status bits defined by the
+ * associated error status registers
+ */
+ u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS];
+ u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS];
+ u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS];
+ u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS];
+ u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS];
+ u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS];
+ u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS];
+
+ /* Software counter that spans all contexts */
+ u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS];
+ /* Software counter that spans all DMA engines */
+ u64 sw_send_dma_eng_err_status_cnt[
+ NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
+ /* Software counter that aggregates all cce_err_status errors */
+ u64 sw_cce_err_status_aggregate;
+ /* Software counter that aggregates all bypass packet rcv errors */
+ u64 sw_rcv_bypass_packet_errors;
+ /* receive interrupt functions */
+ rhf_rcv_function_ptr *rhf_rcv_function_map;
+ rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
+
+ /*
+ * Handlers for outgoing data so that snoop/capture does not
+ * have to have its hooks in the send path
+ */
+ send_routine process_pio_send;
+ send_routine process_dma_send;
+ void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+ u64 pbc, const void *from, size_t count);
+
+ /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
+ u8 oui1;
+ u8 oui2;
+ u8 oui3;
+ /* Timer and counter used to detect RcvBufOvflCnt changes */
+ struct timer_list rcverr_timer;
+ u32 rcv_ovfl_cnt;
+
+ wait_queue_head_t event_queue;
+
+ /* Save the enabled LCB error bits */
+ u64 lcb_err_en;
+ u8 dc_shutdown;
+
+ /* receive context tail dummy address */
+ __le64 *rcvhdrtail_dummy_kvaddr;
+ dma_addr_t rcvhdrtail_dummy_physaddr;
+
+ bool eprom_available; /* true if EPROM is available for this device */
+ bool aspm_supported; /* Does HW support ASPM */
+ bool aspm_enabled; /* ASPM state: enabled/disabled */
+ /* Serialize ASPM enable/disable between multiple verbs contexts */
+ spinlock_t aspm_lock;
+ /* Number of verbs contexts which have disabled ASPM */
+ atomic_t aspm_disabled_cnt;
+
+ struct hfi1_affinity *affinity;
+ struct kobject kobj;
+};
+
+/* 8051 firmware version helper */
+#define dc8051_ver(a, b) ((a) << 8 | (b))
+#define dc8051_ver_maj(a) ((a & 0xff00) >> 8)
+#define dc8051_ver_min(a) (a & 0x00ff)
+
+/* f_put_tid types */
+#define PT_EXPECTED 0
+#define PT_EAGER 1
+#define PT_INVALID 2
+
+struct tid_rb_node;
+struct mmu_rb_node;
+struct mmu_rb_handler;
+
+/* Private data for file operations */
+struct hfi1_filedata {
+ struct hfi1_ctxtdata *uctxt;
+ unsigned subctxt;
+ struct hfi1_user_sdma_comp_q *cq;
+ struct hfi1_user_sdma_pkt_q *pq;
+ /* for cpu affinity; -1 if none */
+ int rec_cpu_num;
+ u32 tid_n_pinned;
+ struct mmu_rb_handler *handler;
+ struct tid_rb_node **entry_to_rb;
+ spinlock_t tid_lock; /* protect tid_[limit,used] counters */
+ u32 tid_limit;
+ u32 tid_used;
+ u32 *invalid_tids;
+ u32 invalid_tid_idx;
+ /* protect invalid_tids array and invalid_tid_idx */
+ spinlock_t invalid_lock;
+ struct mm_struct *mm;
+};
+
+extern struct list_head hfi1_dev_list;
+extern spinlock_t hfi1_devs_lock;
+struct hfi1_devdata *hfi1_lookup(int unit);
+extern u32 hfi1_cpulist_count;
+extern unsigned long *hfi1_cpulist;
+
+extern unsigned int snoop_drop_send;
+extern unsigned int snoop_force_capture;
+int hfi1_init(struct hfi1_devdata *, int);
+int hfi1_count_units(int *npresentp, int *nupp);
+int hfi1_count_active_units(void);
+
+int hfi1_diag_add(struct hfi1_devdata *);
+void hfi1_diag_remove(struct hfi1_devdata *);
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
+
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
+
+int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
+int hfi1_create_ctxts(struct hfi1_devdata *dd);
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int);
+void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
+ struct hfi1_devdata *, u8, u8);
+void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+
+int handle_receive_interrupt(struct hfi1_ctxtdata *, int);
+int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
+int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
+void set_all_slowpath(struct hfi1_devdata *dd);
+
+extern const struct pci_device_id hfi1_pci_tbl[];
+
+/* receive packet handler dispositions */
+#define RCV_PKT_OK 0x0 /* keep going */
+#define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */
+#define RCV_PKT_DONE 0x2 /* stop, no more packets detected */
+
+/* calculate the current RHF address */
+static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd)
+{
+ return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset;
+}
+
+int hfi1_reset_device(int);
+
+/* return the driver's idea of the logical OPA port state */
+static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
+{
+ return ppd->lstate; /* use the cached value */
+}
+
+void receive_interrupt_work(struct work_struct *work);
+
+/* extract service channel from header and rhf */
+static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
+{
+ return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
+ ((!!(rhf_dc_info(rhf))) << 4);
+}
+
+#define HFI1_JKEY_WIDTH 16
+#define HFI1_JKEY_MASK (BIT(16) - 1)
+#define HFI1_ADMIN_JKEY_RANGE 32
+
+/*
+ * J_KEYs are split and allocated in the following groups:
+ * 0 - 31 - users with administrator privileges
+ * 32 - 63 - kernel protocols using KDETH packets
+ * 64 - 65535 - all other users using KDETH packets
+ */
+static inline u16 generate_jkey(kuid_t uid)
+{
+ u16 jkey = from_kuid(current_user_ns(), uid) & HFI1_JKEY_MASK;
+
+ if (capable(CAP_SYS_ADMIN))
+ jkey &= HFI1_ADMIN_JKEY_RANGE - 1;
+ else if (jkey < 64)
+ jkey |= BIT(HFI1_JKEY_WIDTH - 1);
+
+ return jkey;
+}
+
+/*
+ * active_egress_rate
+ *
+ * returns the active egress rate in units of [10^6 bits/sec]
+ */
+static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
+{
+ u16 link_speed = ppd->link_speed_active;
+ u16 link_width = ppd->link_width_active;
+ u32 egress_rate;
+
+ if (link_speed == OPA_LINK_SPEED_25G)
+ egress_rate = 25000;
+ else /* assume OPA_LINK_SPEED_12_5G */
+ egress_rate = 12500;
+
+ switch (link_width) {
+ case OPA_LINK_WIDTH_4X:
+ egress_rate *= 4;
+ break;
+ case OPA_LINK_WIDTH_3X:
+ egress_rate *= 3;
+ break;
+ case OPA_LINK_WIDTH_2X:
+ egress_rate *= 2;
+ break;
+ default:
+ /* assume IB_WIDTH_1X */
+ break;
+ }
+
+ return egress_rate;
+}
+
+/*
+ * egress_cycles
+ *
+ * Returns the number of 'fabric clock cycles' to egress a packet
+ * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
+ * rate is (approximately) 805 MHz, the units of the returned value
+ * are (1/805 MHz).
+ */
+static inline u32 egress_cycles(u32 len, u32 rate)
+{
+ u32 cycles;
+
+ /*
+ * cycles is:
+ *
+ * (length) [bits] / (rate) [bits/sec]
+ * ---------------------------------------------------
+ * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
+ */
+
+ cycles = len * 8; /* bits */
+ cycles *= 805;
+ cycles /= rate;
+
+ return cycles;
+}
+
+void set_link_ipg(struct hfi1_pportdata *ppd);
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+ u32 rqpn, u8 svc_type);
+void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
+ u32 pkey, u32 slid, u32 dlid, u8 sc5,
+ const struct ib_grh *old_grh);
+#define PKEY_CHECK_INVALID -1
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+ u8 sc5, int8_t s_pkey_index);
+
+#define PACKET_EGRESS_TIMEOUT 350
+static inline void pause_for_credit_return(struct hfi1_devdata *dd)
+{
+ /* Pause at least 1us, to ensure chip returns all credits */
+ u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
+
+ udelay(usec ? usec : 1);
+}
+
+/**
+ * sc_to_vlt() reverse lookup sc to vl
+ * @dd - devdata
+ * @sc5 - 5 bit sc
+ */
+static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
+{
+ unsigned seq;
+ u8 rval;
+
+ if (sc5 >= OPA_MAX_SCS)
+ return (u8)(0xff);
+
+ do {
+ seq = read_seqbegin(&dd->sc2vl_lock);
+ rval = *(((u8 *)dd->sc2vl) + sc5);
+ } while (read_seqretry(&dd->sc2vl_lock, seq));
+
+ return rval;
+}
+
+#define PKEY_MEMBER_MASK 0x8000
+#define PKEY_LOW_15_MASK 0x7fff
+
+/*
+ * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for ingress partition keys
+ * specified in the OPAv1 spec., section 9.10.14.
+ */
+static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+ u16 mkey = pkey & PKEY_LOW_15_MASK;
+ u16 ment = ent & PKEY_LOW_15_MASK;
+
+ if (mkey == ment) {
+ /*
+ * If pkey[15] is clear (limited partition member),
+ * is bit 15 in the corresponding table element
+ * clear (limited member)?
+ */
+ if (!(pkey & PKEY_MEMBER_MASK))
+ return !!(ent & PKEY_MEMBER_MASK);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * ingress_pkey_table_search - search the entire pkey table for
+ * an entry which matches 'pkey'. return 0 if a match is found,
+ * and 1 otherwise.
+ */
+static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
+{
+ int i;
+
+ for (i = 0; i < MAX_PKEY_VALUES; i++) {
+ if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * ingress_pkey_table_fail - record a failure of ingress pkey validation,
+ * i.e., increment port_rcv_constraint_errors for the port, and record
+ * the 'error info' for this failure.
+ */
+static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
+ u16 slid)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+
+ incr_cntr64(&ppd->port_rcv_constraint_errors);
+ if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
+ dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
+ dd->err_info_rcv_constraint.slid = slid;
+ dd->err_info_rcv_constraint.pkey = pkey;
+ }
+}
+
+/*
+ * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
+ * is a hint as to the best place in the partition key table to begin
+ * searching. This function should not be called on the data path because
+ * of performance reasons. On datapath pkey check is expected to be done
+ * by HW and rcv_pkey_check function should be called instead.
+ */
+static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+ u8 sc5, u8 idx, u16 slid)
+{
+ if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+ return 0;
+
+ /* If SC15, pkey[0:14] must be 0x7fff */
+ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+ goto bad;
+
+ /* Is the pkey = 0x0, or 0x8000? */
+ if ((pkey & PKEY_LOW_15_MASK) == 0)
+ goto bad;
+
+ /* The most likely matching pkey has index 'idx' */
+ if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
+ return 0;
+
+ /* no match - try the whole table */
+ if (!ingress_pkey_table_search(ppd, pkey))
+ return 0;
+
+bad:
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+}
+
+/*
+ * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. It only ensures pkey is vlid for QP0. This function
+ * should be called on the data path instead of ingress_pkey_check
+ * as on data path, pkey check is done by HW (except for QP0).
+ */
+static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+ u8 sc5, u16 slid)
+{
+ if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+ return 0;
+
+ /* If SC15, pkey[0:14] must be 0x7fff */
+ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+ goto bad;
+
+ return 0;
+bad:
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+}
+
+/* MTU handling */
+
+/* MTU enumeration, 256-4k match IB */
+#define OPA_MTU_0 0
+#define OPA_MTU_256 1
+#define OPA_MTU_512 2
+#define OPA_MTU_1024 3
+#define OPA_MTU_2048 4
+#define OPA_MTU_4096 5
+
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
+int mtu_to_enum(u32 mtu, int default_if_bad);
+u16 enum_to_mtu(int);
+static inline int valid_ib_mtu(unsigned int mtu)
+{
+ return mtu == 256 || mtu == 512 ||
+ mtu == 1024 || mtu == 2048 ||
+ mtu == 4096;
+}
+
+static inline int valid_opa_max_mtu(unsigned int mtu)
+{
+ return mtu >= 2048 &&
+ (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
+}
+
+int set_mtu(struct hfi1_pportdata *);
+
+int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
+void hfi1_disable_after_error(struct hfi1_devdata *);
+int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
+int hfi1_rcvbuf_validate(u32, u8, u16 *);
+
+int fm_get_table(struct hfi1_pportdata *, int, void *);
+int fm_set_table(struct hfi1_pportdata *, int, void *);
+
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
+void reset_link_credits(struct hfi1_devdata *dd);
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
+
+int snoop_recv_handler(struct hfi1_packet *packet);
+int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ u64 pbc);
+int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ u64 pbc);
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+ u64 pbc, const void *from, size_t count);
+int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc);
+
+static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
+{
+ return ppd->dd;
+}
+
+static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
+{
+ return container_of(dev, struct hfi1_devdata, verbs_dev);
+}
+
+static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+ return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
+{
+ return container_of(ibp, struct hfi1_pportdata, ibport_data);
+}
+
+static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi)
+{
+ return container_of(rdi, struct hfi1_ibdev, rdi);
+}
+
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+ WARN_ON(pidx >= dd->num_pports);
+ return &dd->pport[pidx].ibport_data;
+}
+
+void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool do_cnp);
+static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool do_cnp)
+{
+ struct hfi1_other_headers *ohdr = pkt->ohdr;
+ u32 bth1;
+
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+ hfi1_process_ecn_slowpath(qp, pkt, do_cnp);
+ return bth1 & HFI1_FECN_SMASK;
+ }
+ return false;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u16 ret;
+
+ if (index >= ARRAY_SIZE(ppd->pkeys))
+ ret = 0;
+ else
+ ret = ppd->pkeys[index];
+
+ return ret;
+}
+
+/*
+ * Called by readers of cc_state only, must call under rcu_read_lock().
+ */
+static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
+{
+ return rcu_dereference(ppd->cc_state);
+}
+
+/*
+ * Called by writers of cc_state only, must call under cc_state_lock.
+ */
+static inline
+struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd)
+{
+ return rcu_dereference_protected(ppd->cc_state,
+ lockdep_is_held(&ppd->cc_state_lock));
+}
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define HFI1_INITTED 0x1 /* chip and driver up and initted */
+#define HFI1_PRESENT 0x2 /* chip accesses can be done */
+#define HFI1_FROZEN 0x4 /* chip in SPC freeze */
+#define HFI1_HAS_SDMA_TIMEOUT 0x8
+#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */
+#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1)
+
+/* ctxt_flag bit offsets */
+ /* context has been setup */
+#define HFI1_CTXT_SETUP_DONE 1
+ /* waiting for a packet to arrive */
+#define HFI1_CTXT_WAITING_RCV 2
+ /* master has not finished initializing */
+#define HFI1_CTXT_MASTER_UNINIT 4
+ /* waiting for an urgent packet to arrive */
+#define HFI1_CTXT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
+ const struct pci_device_id *);
+void hfi1_free_devdata(struct hfi1_devdata *);
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+/* LED beaconing functions */
+void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
+ unsigned int timeoff);
+void shutdown_led_override(struct hfi1_pportdata *ppd);
+
+#define HFI1_CREDIT_RETURN_RATE (100)
+
+/*
+ * The number of words for the KDETH protocol field. If this is
+ * larger then the actual field used, then part of the payload
+ * will be in the header.
+ *
+ * Optimally, we want this sized so that a typical case will
+ * use full cache lines. The typical local KDETH header would
+ * be:
+ *
+ * Bytes Field
+ * 8 LRH
+ * 12 BHT
+ * ?? KDETH
+ * 8 RHF
+ * ---
+ * 28 + KDETH
+ *
+ * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
+ */
+#define DEFAULT_RCVHDRSIZE 9
+
+/*
+ * Maximal header byte count:
+ *
+ * Bytes Field
+ * 8 LRH
+ * 40 GRH (optional)
+ * 12 BTH
+ * ?? KDETH
+ * 8 RHF
+ * ---
+ * 68 + KDETH
+ *
+ * We also want to maintain a cache line alignment to assist DMA'ing
+ * of the header bytes. Round up to a good size.
+ */
+#define DEFAULT_RCVHDR_ENTSIZE 32
+
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
+ u32 nlocked, u32 npages);
+int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr,
+ size_t npages, bool writable, struct page **pages);
+void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
+ size_t npages, bool dirty);
+
+static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+ *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+ /*
+ * volatile because it's a DMA target from the chip, routine is
+ * inlined, and don't want register caching or reordering.
+ */
+ return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_hfi1_version[];
+
+int hfi1_device_create(struct hfi1_devdata *);
+void hfi1_device_remove(struct hfi1_devdata *);
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+ struct kobject *kobj);
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
+/* Hook for sysfs read of QSFP */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
+
+int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
+void hfi1_pcie_cleanup(struct pci_dev *);
+int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
+ const struct pci_device_id *);
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
+void hfi1_pcie_flr(struct hfi1_devdata *);
+int pcie_speeds(struct hfi1_devdata *);
+void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
+void hfi1_enable_intx(struct pci_dev *);
+void restore_pci_variables(struct hfi1_devdata *dd);
+int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+int parse_platform_config(struct hfi1_devdata *dd);
+int get_platform_config_field(struct hfi1_devdata *dd,
+ enum platform_config_table_type_encoding
+ table_type, int table_index, int field_index,
+ u32 *data, u32 len);
+
+const char *get_unit_name(int unit);
+const char *get_card_name(struct rvt_dev_info *rdi);
+struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+static inline void flush_wc(void)
+{
+ asm volatile("sfence" : : : "memory");
+}
+
+void handle_eflags(struct hfi1_packet *packet);
+int process_receive_ib(struct hfi1_packet *packet);
+int process_receive_bypass(struct hfi1_packet *packet);
+int process_receive_error(struct hfi1_packet *packet);
+int kdeth_process_expected(struct hfi1_packet *packet);
+int kdeth_process_eager(struct hfi1_packet *packet);
+int process_receive_invalid(struct hfi1_packet *packet);
+
+extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
+
+void update_sge(struct rvt_sge_state *ss, u32 length);
+
+/* global module parameter variables */
+extern unsigned int hfi1_max_mtu;
+extern unsigned int hfi1_cu;
+extern unsigned int user_credit_return_threshold;
+extern int num_user_contexts;
+extern unsigned long n_krcvqs;
+extern uint krcvqs[];
+extern int krcvqsset;
+extern uint kdeth_qp;
+extern uint loopback;
+extern uint quick_linkup;
+extern uint rcv_intr_timeout;
+extern uint rcv_intr_count;
+extern uint rcv_intr_dynamic;
+extern ushort link_crc_mask;
+
+extern struct mutex hfi1_mutex;
+
+/* Number of seconds before our card status check... */
+#define STATUS_TIMEOUT 60
+
+#define DRIVER_NAME "hfi1"
+#define HFI1_USER_MINOR_BASE 0
+#define HFI1_TRACE_MINOR 127
+#define HFI1_DIAGPKT_MINOR 128
+#define HFI1_DIAG_MINOR_BASE 129
+#define HFI1_SNOOP_CAPTURE_BASE 200
+#define HFI1_NMINORS 255
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_DEVICE_ID_INTEL0 0x24f0
+#define PCI_DEVICE_ID_INTEL1 0x24f1
+
+#define HFI1_PKT_USER_SC_INTEGRITY \
+ (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
+
+#define HFI1_PKT_KERNEL_SC_INTEGRITY \
+ (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
+
+static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
+ u16 ctxt_type)
+{
+ u64 base_sc_integrity =
+ SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+ | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
+ | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+ if (ctxt_type == SC_USER)
+ base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
+ else
+ base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
+
+ if (is_ax(dd))
+ /* turn off send-side job key checks - A0 */
+ return base_sc_integrity &
+ ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ return base_sc_integrity;
+}
+
+static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
+{
+ u64 base_sdma_integrity =
+ SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+ | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
+ | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+ if (is_ax(dd))
+ /* turn off send-side job key checks - A0 */
+ return base_sdma_integrity &
+ ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+ return base_sdma_integrity;
+}
+
+/*
+ * hfi1_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is
+ * the same as dd_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening..
+ */
+#define hfi1_early_err(dev, fmt, ...) \
+ dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define hfi1_early_info(dev, fmt, ...) \
+ dev_info(dev, fmt, ##__VA_ARGS__)
+
+#define dd_dev_emerg(dd, fmt, ...) \
+ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_err(dd, fmt, ...) \
+ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_warn(dd, fmt, ...) \
+ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_warn_ratelimited(dd, fmt, ...) \
+ dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_info(dd, fmt, ...) \
+ dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_dbg(dd, fmt, ...) \
+ dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \
+ get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define hfi1_dev_porterr(dd, port, fmt, ...) \
+ dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \
+ get_unit_name((dd)->unit), (port), ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct hfi1_hwerror_msgs {
+ u64 mask;
+ const char *msg;
+ size_t sz;
+};
+
+/* in intr.c... */
+void hfi1_format_hwerrors(u64 hwerrs,
+ const struct hfi1_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs, char *msg, size_t lmsg);
+
+#define USER_OPCODE_CHECK_VAL 0xC0
+#define USER_OPCODE_CHECK_MASK 0xC0
+#define OPCODE_CHECK_VAL_DISABLED 0x0
+#define OPCODE_CHECK_MASK_DISABLED 0x0
+
+static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ dd->z_int_counter = get_all_cpu_total(dd->int_counter);
+ dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
+ dd->z_send_schedule = get_all_cpu_total(dd->send_schedule);
+
+ ppd = (struct hfi1_pportdata *)(dd + 1);
+ for (i = 0; i < dd->num_pports; i++, ppd++) {
+ ppd->ibport_data.rvp.z_rc_acks =
+ get_all_cpu_total(ppd->ibport_data.rvp.rc_acks);
+ ppd->ibport_data.rvp.z_rc_qacks =
+ get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks);
+ }
+}
+
+/* Control LED state */
+static inline void setextled(struct hfi1_devdata *dd, u32 on)
+{
+ if (on)
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
+ else
+ write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
+}
+
+/* return the i2c resource given the target */
+static inline u32 i2c_target(u32 target)
+{
+ return target ? CR_I2C2 : CR_I2C1;
+}
+
+/* return the i2c chain chip resource that this HFI uses for QSFP */
+static inline u32 qsfp_resource(struct hfi1_devdata *dd)
+{
+ return i2c_target(dd->hfi1_id);
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
+
+#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype) \
+__print_symbolic(etype, \
+ packettype_name(EXPECTED), \
+ packettype_name(EAGER), \
+ packettype_name(IB), \
+ packettype_name(ERROR), \
+ packettype_name(BYPASS))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode }
+#define show_ib_opcode(opcode) \
+__print_symbolic(opcode, \
+ ib_opcode_name(RC_SEND_FIRST), \
+ ib_opcode_name(RC_SEND_MIDDLE), \
+ ib_opcode_name(RC_SEND_LAST), \
+ ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_SEND_ONLY), \
+ ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_RDMA_WRITE_FIRST), \
+ ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \
+ ib_opcode_name(RC_RDMA_WRITE_LAST), \
+ ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_RDMA_WRITE_ONLY), \
+ ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(RC_RDMA_READ_REQUEST), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \
+ ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \
+ ib_opcode_name(RC_ACKNOWLEDGE), \
+ ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
+ ib_opcode_name(RC_COMPARE_SWAP), \
+ ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(UC_SEND_FIRST), \
+ ib_opcode_name(UC_SEND_MIDDLE), \
+ ib_opcode_name(UC_SEND_LAST), \
+ ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(UC_SEND_ONLY), \
+ ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(UC_RDMA_WRITE_FIRST), \
+ ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \
+ ib_opcode_name(UC_RDMA_WRITE_LAST), \
+ ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+ ib_opcode_name(UC_RDMA_WRITE_ONLY), \
+ ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(UD_SEND_ONLY), \
+ ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \
+ ib_opcode_name(CNP))
+#endif /* _HFI1_KERNEL_H */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
new file mode 100644
index 000000000000..384b43d2fd49
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -0,0 +1,1836 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+#include <rdma/rdma_vt.h>
+
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "mad.h"
+#include "sdma.h"
+#include "debugfs.h"
+#include "verbs.h"
+#include "aspm.h"
+#include "affinity.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define HFI1_MIN_USER_CTXT_BUFCNT 7
+
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
+#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
+#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+
+/*
+ * Number of user receive contexts we are configured to use (to allow for more
+ * pio buffers per ctxt, etc.) Zero means use one user context per CPU.
+ */
+int num_user_contexts = -1;
+module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO);
+MODULE_PARM_DESC(
+ num_user_contexts, "Set max number of user contexts to use");
+
+uint krcvqs[RXE_NUM_DATA_VL];
+int krcvqsset;
+module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");
+
+/* computed based on above array */
+unsigned long n_krcvqs;
+
+static unsigned hfi1_rcvarr_split = 25;
+module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
+
+static uint eager_buffer_size = (2 << 20); /* 2MB */
+module_param(eager_buffer_size, uint, S_IRUGO);
+MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
+
+static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
+module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
+
+static uint hfi1_hdrq_entsize = 32;
+module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
+MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B");
+
+unsigned int user_credit_return_threshold = 33; /* default is 33% */
+module_param(user_credit_return_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
+
+static inline u64 encode_rcv_header_entry_size(u16);
+
+static struct idr hfi1_unit_table;
+u32 hfi1_cpulist_count;
+unsigned long *hfi1_cpulist;
+
+/*
+ * Common code for creating the receive context array.
+ */
+int hfi1_create_ctxts(struct hfi1_devdata *dd)
+{
+ unsigned i;
+ int ret;
+
+ /* Control context has to be always 0 */
+ BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);
+
+ dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd),
+ GFP_KERNEL, dd->node);
+ if (!dd->rcd)
+ goto nomem;
+
+ /* create one or more kernel contexts */
+ for (i = 0; i < dd->first_user_ctxt; ++i) {
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ctxtdata *rcd;
+
+ ppd = dd->pport + (i % dd->num_pports);
+ rcd = hfi1_create_ctxtdata(ppd, i, dd->node);
+ if (!rcd) {
+ dd_dev_err(dd,
+ "Unable to allocate kernel receive context, failing\n");
+ goto nomem;
+ }
+ /*
+ * Set up the kernel context flags here and now because they
+ * use default values for all receive side memories. User
+ * contexts will be handled as they are created.
+ */
+ rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+ HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+ HFI1_CAP_KGET(NODROP_EGR_FULL) |
+ HFI1_CAP_KGET(DMA_RTAIL);
+
+ /* Control context must use DMA_RTAIL */
+ if (rcd->ctxt == HFI1_CTRL_CTXT)
+ rcd->flags |= HFI1_CAP_DMA_RTAIL;
+ rcd->seq_cnt = 1;
+
+ rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
+ if (!rcd->sc) {
+ dd_dev_err(dd,
+ "Unable to allocate kernel send context, failing\n");
+ dd->rcd[rcd->ctxt] = NULL;
+ hfi1_free_ctxtdata(dd, rcd);
+ goto nomem;
+ }
+
+ ret = hfi1_init_ctxt(rcd->sc);
+ if (ret < 0) {
+ dd_dev_err(dd,
+ "Failed to setup kernel receive context, failing\n");
+ sc_free(rcd->sc);
+ dd->rcd[rcd->ctxt] = NULL;
+ hfi1_free_ctxtdata(dd, rcd);
+ ret = -EFAULT;
+ goto bail;
+ }
+ }
+
+ /*
+ * Initialize aspm, to be done after gen3 transition and setting up
+ * contexts and before enabling interrupts
+ */
+ aspm_init(dd);
+
+ return 0;
+nomem:
+ ret = -ENOMEM;
+bail:
+ kfree(dd->rcd);
+ dd->rcd = NULL;
+ return ret;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt,
+ int numa)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_ctxtdata *rcd;
+ unsigned kctxt_ngroups = 0;
+ u32 base;
+
+ if (dd->rcv_entries.nctxt_extra >
+ dd->num_rcv_contexts - dd->first_user_ctxt)
+ kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
+ (dd->num_rcv_contexts - dd->first_user_ctxt));
+ rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+ if (rcd) {
+ u32 rcvtids, max_entries;
+
+ hfi1_cdbg(PROC, "setting up context %u\n", ctxt);
+
+ INIT_LIST_HEAD(&rcd->qp_wait_list);
+ rcd->ppd = ppd;
+ rcd->dd = dd;
+ rcd->cnt = 1;
+ rcd->ctxt = ctxt;
+ dd->rcd[ctxt] = rcd;
+ rcd->numa_id = numa;
+ rcd->rcv_array_groups = dd->rcv_entries.ngroups;
+
+ mutex_init(&rcd->exp_lock);
+
+ /*
+ * Calculate the context's RcvArray entry starting point.
+ * We do this here because we have to take into account all
+ * the RcvArray entries that previous context would have
+ * taken and we have to account for any extra groups
+ * assigned to the kernel or user contexts.
+ */
+ if (ctxt < dd->first_user_ctxt) {
+ if (ctxt < kctxt_ngroups) {
+ base = ctxt * (dd->rcv_entries.ngroups + 1);
+ rcd->rcv_array_groups++;
+ } else
+ base = kctxt_ngroups +
+ (ctxt * dd->rcv_entries.ngroups);
+ } else {
+ u16 ct = ctxt - dd->first_user_ctxt;
+
+ base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
+ kctxt_ngroups);
+ if (ct < dd->rcv_entries.nctxt_extra) {
+ base += ct * (dd->rcv_entries.ngroups + 1);
+ rcd->rcv_array_groups++;
+ } else
+ base += dd->rcv_entries.nctxt_extra +
+ (ct * dd->rcv_entries.ngroups);
+ }
+ rcd->eager_base = base * dd->rcv_entries.group_size;
+
+ /* Validate and initialize Rcv Hdr Q variables */
+ if (rcvhdrcnt % HDRQ_INCREMENT) {
+ dd_dev_err(dd,
+ "ctxt%u: header queue count %d must be divisible by %lu\n",
+ rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
+ goto bail;
+ }
+ rcd->rcvhdrq_cnt = rcvhdrcnt;
+ rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
+ /*
+ * Simple Eager buffer allocation: we have already pre-allocated
+ * the number of RcvArray entry groups. Each ctxtdata structure
+ * holds the number of groups for that context.
+ *
+ * To follow CSR requirements and maintain cacheline alignment,
+ * make sure all sizes and bases are multiples of group_size.
+ *
+ * The expected entry count is what is left after assigning
+ * eager.
+ */
+ max_entries = rcd->rcv_array_groups *
+ dd->rcv_entries.group_size;
+ rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
+ rcd->egrbufs.count = round_down(rcvtids,
+ dd->rcv_entries.group_size);
+ if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
+ dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
+ rcd->ctxt);
+ rcd->egrbufs.count = MAX_EAGER_ENTRIES;
+ }
+ hfi1_cdbg(PROC,
+ "ctxt%u: max Eager buffer RcvArray entries: %u\n",
+ rcd->ctxt, rcd->egrbufs.count);
+
+ /*
+ * Allocate array that will hold the eager buffer accounting
+ * data.
+ * This will allocate the maximum possible buffer count based
+ * on the value of the RcvArray split parameter.
+ * The resulting value will be rounded down to the closest
+ * multiple of dd->rcv_entries.group_size.
+ */
+ rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count,
+ sizeof(*rcd->egrbufs.buffers),
+ GFP_KERNEL);
+ if (!rcd->egrbufs.buffers)
+ goto bail;
+ rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count,
+ sizeof(*rcd->egrbufs.rcvtids),
+ GFP_KERNEL);
+ if (!rcd->egrbufs.rcvtids)
+ goto bail;
+ rcd->egrbufs.size = eager_buffer_size;
+ /*
+ * The size of the buffers programmed into the RcvArray
+ * entries needs to be big enough to handle the highest
+ * MTU supported.
+ */
+ if (rcd->egrbufs.size < hfi1_max_mtu) {
+ rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
+ hfi1_cdbg(PROC,
+ "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+ rcd->ctxt, rcd->egrbufs.size);
+ }
+ rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
+
+ if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+ rcd->opstats = kzalloc(sizeof(*rcd->opstats),
+ GFP_KERNEL);
+ if (!rcd->opstats)
+ goto bail;
+ }
+ }
+ return rcd;
+bail:
+ kfree(rcd->egrbufs.rcvtids);
+ kfree(rcd->egrbufs.buffers);
+ kfree(rcd);
+ return NULL;
+}
+
+/*
+ * Convert a receive header entry size that to the encoding used in the CSR.
+ *
+ * Return a zero if the given size is invalid.
+ */
+static inline u64 encode_rcv_header_entry_size(u16 size)
+{
+ /* there are only 3 valid receive header entry sizes */
+ if (size == 2)
+ return 1;
+ if (size == 16)
+ return 2;
+ else if (size == 32)
+ return 4;
+ return 0; /* invalid */
+}
+
+/*
+ * Select the largest ccti value over all SLs to determine the intra-
+ * packet gap for the link.
+ *
+ * called with cca_timer_lock held (to protect access to cca_timer
+ * array), and rcu_read_lock() (to protect access to cc_state).
+ */
+void set_link_ipg(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct cc_state *cc_state;
+ int i;
+ u16 cce, ccti_limit, max_ccti = 0;
+ u16 shift, mult;
+ u64 src;
+ u32 current_egress_rate; /* Mbits /sec */
+ u32 max_pkt_time;
+ /*
+ * max_pkt_time is the maximum packet egress time in units
+ * of the fabric clock period 1/(805 MHz).
+ */
+
+ cc_state = get_cc_state(ppd);
+
+ if (!cc_state)
+ /*
+ * This should _never_ happen - rcu_read_lock() is held,
+ * and set_link_ipg() should not be called if cc_state
+ * is NULL.
+ */
+ return;
+
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ u16 ccti = ppd->cca_timer[i].ccti;
+
+ if (ccti > max_ccti)
+ max_ccti = ccti;
+ }
+
+ ccti_limit = cc_state->cct.ccti_limit;
+ if (max_ccti > ccti_limit)
+ max_ccti = ccti_limit;
+
+ cce = cc_state->cct.entries[max_ccti].entry;
+ shift = (cce & 0xc000) >> 14;
+ mult = (cce & 0x3fff);
+
+ current_egress_rate = active_egress_rate(ppd);
+
+ max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
+
+ src = (max_pkt_time >> shift) * mult;
+
+ src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
+ src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
+
+ write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
+}
+
+static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
+{
+ struct cca_timer *cca_timer;
+ struct hfi1_pportdata *ppd;
+ int sl;
+ u16 ccti_timer, ccti_min;
+ struct cc_state *cc_state;
+ unsigned long flags;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+
+ cca_timer = container_of(t, struct cca_timer, hrtimer);
+ ppd = cca_timer->ppd;
+ sl = cca_timer->sl;
+
+ rcu_read_lock();
+
+ cc_state = get_cc_state(ppd);
+
+ if (!cc_state) {
+ rcu_read_unlock();
+ return HRTIMER_NORESTART;
+ }
+
+ /*
+ * 1) decrement ccti for SL
+ * 2) calculate IPG for link (set_link_ipg())
+ * 3) restart timer, unless ccti is at min value
+ */
+
+ ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
+ ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+
+ spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+
+ if (cca_timer->ccti > ccti_min) {
+ cca_timer->ccti--;
+ set_link_ipg(ppd);
+ }
+
+ if (cca_timer->ccti > ccti_min) {
+ unsigned long nsec = 1024 * ccti_timer;
+ /* ccti_timer is in units of 1.024 usec */
+ hrtimer_forward_now(t, ns_to_ktime(nsec));
+ ret = HRTIMER_RESTART;
+ }
+
+ spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
+ struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+{
+ int i;
+ uint default_pkey_idx;
+ struct cc_state *cc_state;
+
+ ppd->dd = dd;
+ ppd->hw_pidx = hw_pidx;
+ ppd->port = port; /* IB port number, not index */
+
+ default_pkey_idx = 1;
+
+ ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
+ if (loopback) {
+ hfi1_early_err(&pdev->dev,
+ "Faking data partition 0x8001 in idx %u\n",
+ !default_pkey_idx);
+ ppd->pkeys[!default_pkey_idx] = 0x8001;
+ }
+
+ INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
+ INIT_WORK(&ppd->link_up_work, handle_link_up);
+ INIT_WORK(&ppd->link_down_work, handle_link_down);
+ INIT_WORK(&ppd->freeze_work, handle_freeze);
+ INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
+ INIT_WORK(&ppd->sma_message_work, handle_sma_message);
+ INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+ INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
+ INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
+ INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+ mutex_init(&ppd->hls_lock);
+ spin_lock_init(&ppd->sdma_alllock);
+ spin_lock_init(&ppd->qsfp_info.qsfp_lock);
+
+ ppd->qsfp_info.ppd = ppd;
+ ppd->sm_trap_qp = 0x0;
+ ppd->sa_qp = 0x1;
+
+ ppd->hfi1_wq = NULL;
+
+ spin_lock_init(&ppd->cca_timer_lock);
+
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ ppd->cca_timer[i].ppd = ppd;
+ ppd->cca_timer[i].sl = i;
+ ppd->cca_timer[i].ccti = 0;
+ ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
+ }
+
+ ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
+
+ spin_lock_init(&ppd->cc_state_lock);
+ spin_lock_init(&ppd->cc_log_lock);
+ cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
+ RCU_INIT_POINTER(ppd->cc_state, cc_state);
+ if (!cc_state)
+ goto bail;
+ return;
+
+bail:
+
+ hfi1_early_err(&pdev->dev,
+ "Congestion Control Agent disabled for port %d\n", port);
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct hfi1_devdata *dd)
+{
+ return 0;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the hfi1_ib device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_after_reset(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /*
+ * Ensure chip does no sends or receives, tail updates, or
+ * pioavail updates while we re-initialize. This is mostly
+ * for the driver data structures, not chip registers.
+ */
+ for (i = 0; i < dd->num_rcv_contexts; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+ HFI1_RCVCTRL_INTRAVAIL_DIS |
+ HFI1_RCVCTRL_TAILUPD_DIS, i);
+ pio_send_control(dd, PSC_GLOBAL_DISABLE);
+ for (i = 0; i < dd->num_send_contexts; i++)
+ sc_disable(dd->send_contexts[i].sc);
+
+ return 0;
+}
+
+static void enable_chip(struct hfi1_devdata *dd)
+{
+ u32 rcvmask;
+ u32 i;
+
+ /* enable PIO send */
+ pio_send_control(dd, PSC_GLOBAL_ENABLE);
+
+ /*
+ * Enable kernel ctxts' receive and receive interrupt.
+ * Other ctxts done as user opens and initializes them.
+ */
+ for (i = 0; i < dd->first_user_ctxt; ++i) {
+ rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
+ rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+ HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+ if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
+ rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+ if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
+ rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+ if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
+ rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ hfi1_rcvctrl(dd, rcvmask, i);
+ sc_enable(dd->rcd[i]->sc);
+ }
+}
+
+/**
+ * create_workqueues - create per port workqueues
+ * @dd: the hfi1_ib device
+ */
+static int create_workqueues(struct hfi1_devdata *dd)
+{
+ int pidx;
+ struct hfi1_pportdata *ppd;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (!ppd->hfi1_wq) {
+ ppd->hfi1_wq =
+ alloc_workqueue(
+ "hfi%d_%d",
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ dd->num_sdma,
+ dd->unit, pidx);
+ if (!ppd->hfi1_wq)
+ goto wq_error;
+ }
+ }
+ return 0;
+wq_error:
+ pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->hfi1_wq) {
+ destroy_workqueue(ppd->hfi1_wq);
+ ppd->hfi1_wq = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+/**
+ * hfi1_init - do the actual initialization sequence on the chip
+ * @dd: the hfi1_ib device
+ * @reinit: re-initializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip. This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0). We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int hfi1_init(struct hfi1_devdata *dd, int reinit)
+{
+ int ret = 0, pidx, lastfail = 0;
+ unsigned i, len;
+ struct hfi1_ctxtdata *rcd;
+ struct hfi1_pportdata *ppd;
+
+ /* Set up recv low level handlers */
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
+ kdeth_process_expected;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
+ kdeth_process_eager;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
+ process_receive_error;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
+ process_receive_bypass;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
+ process_receive_invalid;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
+ process_receive_invalid;
+ dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
+ process_receive_invalid;
+ dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+
+ /* Set up send low level handlers */
+ dd->process_pio_send = hfi1_verbs_send_pio;
+ dd->process_dma_send = hfi1_verbs_send_dma;
+ dd->pio_inline_send = pio_copy;
+
+ if (is_ax(dd)) {
+ atomic_set(&dd->drop_packet, DROP_PACKET_ON);
+ dd->do_drop = 1;
+ } else {
+ atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
+ dd->do_drop = 0;
+ }
+
+ /* make sure the link is not "up" */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ ppd->linkup = 0;
+ }
+
+ if (reinit)
+ ret = init_after_reset(dd);
+ else
+ ret = loadtime_init(dd);
+ if (ret)
+ goto done;
+
+ /* allocate dummy tail memory for all receive contexts */
+ dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
+ &dd->pcidev->dev, sizeof(u64),
+ &dd->rcvhdrtail_dummy_physaddr,
+ GFP_KERNEL);
+
+ if (!dd->rcvhdrtail_dummy_kvaddr) {
+ dd_dev_err(dd, "cannot allocate dummy tail memory\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ /* dd->rcd can be NULL if early initialization failed */
+ for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+ /*
+ * Set up the (kernel) rcvhdr queue and egr TIDs. If doing
+ * re-init, the simplest way to handle this is to free
+ * existing, and re-allocate.
+ * Need to re-create rest of ctxt 0 ctxtdata as well.
+ */
+ rcd = dd->rcd[i];
+ if (!rcd)
+ continue;
+
+ rcd->do_interrupt = &handle_receive_interrupt;
+
+ lastfail = hfi1_create_rcvhdrq(dd, rcd);
+ if (!lastfail)
+ lastfail = hfi1_setup_eagerbufs(rcd);
+ if (lastfail) {
+ dd_dev_err(dd,
+ "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+ ret = lastfail;
+ }
+ }
+
+ /* Allocate enough memory for user event notification. */
+ len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
+ sizeof(*dd->events));
+ dd->events = vmalloc_user(len);
+ if (!dd->events)
+ dd_dev_err(dd, "Failed to allocate user events page\n");
+ /*
+ * Allocate a page for device and port status.
+ * Page will be shared amongst all user processes.
+ */
+ dd->status = vmalloc_user(PAGE_SIZE);
+ if (!dd->status)
+ dd_dev_err(dd, "Failed to allocate dev status page\n");
+ else
+ dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
+ sizeof(dd->status->freezemsg));
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (dd->status)
+ /* Currently, we only have one port */
+ ppd->statusp = &dd->status->port;
+
+ set_mtu(ppd);
+ }
+
+ /* enable chip even if we have an error, so we can debug cause */
+ enable_chip(dd);
+
+done:
+ /*
+ * Set status even if port serdes is not initialized
+ * so that diags will work.
+ */
+ if (dd->status)
+ dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
+ HFI1_STATUS_INITTED;
+ if (!ret) {
+ /* enable all interrupts from the chip */
+ set_intr_state(dd, 1);
+
+ /* chip is OK for user apps; mark it as initialized */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ /*
+ * start the serdes - must be after interrupts are
+ * enabled so we are notified when the link goes up
+ */
+ lastfail = bringup_serdes(ppd);
+ if (lastfail)
+ dd_dev_info(dd,
+ "Failed to bring up port %u\n",
+ ppd->port);
+
+ /*
+ * Set status even if port serdes is not initialized
+ * so that diags will work.
+ */
+ if (ppd->statusp)
+ *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
+ HFI1_STATUS_INITTED;
+ if (!ppd->link_speed_enabled)
+ continue;
+ }
+ }
+
+ /* if ret is non-zero, we probably should do some cleanup here... */
+ return ret;
+}
+
+static inline struct hfi1_devdata *__hfi1_lookup(int unit)
+{
+ return idr_find(&hfi1_unit_table, unit);
+}
+
+struct hfi1_devdata *hfi1_lookup(int unit)
+{
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ dd = __hfi1_lookup(unit);
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+ return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void stop_timers(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int pidx;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->led_override_timer.data) {
+ del_timer_sync(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 0);
+ }
+ }
+}
+
+/**
+ * shutdown_device - shut down a device
+ * @dd: the hfi1_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled. It does not free any data structures.
+ * Everything it does has to be setup again by hfi1_init(dd, 1)
+ */
+static void shutdown_device(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ unsigned pidx;
+ int i;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ ppd->linkup = 0;
+ if (ppd->statusp)
+ *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+ HFI1_STATUS_IB_READY);
+ }
+ dd->flags &= ~HFI1_INITTED;
+
+ /* mask interrupts, but not errors */
+ set_intr_state(dd, 0);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ for (i = 0; i < dd->num_rcv_contexts; i++)
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
+ HFI1_RCVCTRL_CTXT_DIS |
+ HFI1_RCVCTRL_INTRAVAIL_DIS |
+ HFI1_RCVCTRL_PKEY_DIS |
+ HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+ /*
+ * Gracefully stop all sends allowing any in progress to
+ * trickle out first.
+ */
+ for (i = 0; i < dd->num_send_contexts; i++)
+ sc_flush(dd->send_contexts[i].sc);
+ }
+
+ /*
+ * Enough for anything that's going to trickle out to have actually
+ * done so.
+ */
+ udelay(20);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ /* disable all contexts */
+ for (i = 0; i < dd->num_send_contexts; i++)
+ sc_disable(dd->send_contexts[i].sc);
+ /* disable the send device */
+ pio_send_control(dd, PSC_GLOBAL_DISABLE);
+
+ shutdown_led_override(ppd);
+
+ /*
+ * Clear SerdesEnable.
+ * We can't count on interrupts since we are stopping.
+ */
+ hfi1_quiet_serdes(ppd);
+
+ if (ppd->hfi1_wq) {
+ destroy_workqueue(ppd->hfi1_wq);
+ ppd->hfi1_wq = NULL;
+ }
+ }
+ sdma_exit(dd);
+}
+
+/**
+ * hfi1_free_ctxtdata - free a context's allocated data
+ * @dd: the hfi1_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after hfi1_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+ unsigned e;
+
+ if (!rcd)
+ return;
+
+ if (rcd->rcvhdrq) {
+ dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+ rcd->rcvhdrq, rcd->rcvhdrq_phys);
+ rcd->rcvhdrq = NULL;
+ if (rcd->rcvhdrtail_kvaddr) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ (void *)rcd->rcvhdrtail_kvaddr,
+ rcd->rcvhdrqtailaddr_phys);
+ rcd->rcvhdrtail_kvaddr = NULL;
+ }
+ }
+
+ /* all the RcvArray entries should have been cleared by now */
+ kfree(rcd->egrbufs.rcvtids);
+
+ for (e = 0; e < rcd->egrbufs.alloced; e++) {
+ if (rcd->egrbufs.buffers[e].phys)
+ dma_free_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.buffers[e].len,
+ rcd->egrbufs.buffers[e].addr,
+ rcd->egrbufs.buffers[e].phys);
+ }
+ kfree(rcd->egrbufs.buffers);
+
+ sc_free(rcd->sc);
+ vfree(rcd->user_event_mask);
+ vfree(rcd->subctxt_uregbase);
+ vfree(rcd->subctxt_rcvegrbuf);
+ vfree(rcd->subctxt_rcvhdr_base);
+ kfree(rcd->opstats);
+ kfree(rcd);
+}
+
+/*
+ * Release our hold on the shared asic data. If we are the last one,
+ * return the structure to be finalized outside the lock. Must be
+ * holding hfi1_devs_lock.
+ */
+static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
+{
+ struct hfi1_asic_data *ad;
+ int other;
+
+ if (!dd->asic_data)
+ return NULL;
+ dd->asic_data->dds[dd->hfi1_id] = NULL;
+ other = dd->hfi1_id ? 0 : 1;
+ ad = dd->asic_data;
+ dd->asic_data = NULL;
+ /* return NULL if the other dd still has a link */
+ return ad->dds[other] ? NULL : ad;
+}
+
+static void finalize_asic_data(struct hfi1_devdata *dd,
+ struct hfi1_asic_data *ad)
+{
+ clean_up_i2c(dd, ad);
+ kfree(ad);
+}
+
+static void __hfi1_free_devdata(struct kobject *kobj)
+{
+ struct hfi1_devdata *dd =
+ container_of(kobj, struct hfi1_devdata, kobj);
+ struct hfi1_asic_data *ad;
+ unsigned long flags;
+
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+ idr_remove(&hfi1_unit_table, dd->unit);
+ list_del(&dd->list);
+ ad = release_asic_data(dd);
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ if (ad)
+ finalize_asic_data(dd, ad);
+ free_platform_config(dd);
+ rcu_barrier(); /* wait for rcu callbacks to complete */
+ free_percpu(dd->int_counter);
+ free_percpu(dd->rcv_limit);
+ free_percpu(dd->send_schedule);
+ rvt_dealloc_device(&dd->verbs_dev.rdi);
+}
+
+static struct kobj_type hfi1_devdata_type = {
+ .release = __hfi1_free_devdata,
+};
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+ kobject_put(&dd->kobj);
+}
+
+/*
+ * Allocate our primary per-unit data structure. Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+ unsigned long flags;
+ struct hfi1_devdata *dd;
+ int ret, nports;
+
+ /* extra is * number of ports */
+ nports = extra / sizeof(struct hfi1_pportdata);
+
+ dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
+ nports);
+ if (!dd)
+ return ERR_PTR(-ENOMEM);
+ dd->num_pports = nports;
+ dd->pport = (struct hfi1_pportdata *)(dd + 1);
+
+ INIT_LIST_HEAD(&dd->list);
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+ ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
+ if (ret >= 0) {
+ dd->unit = ret;
+ list_add(&dd->list, &hfi1_dev_list);
+ }
+
+ spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ idr_preload_end();
+
+ if (ret < 0) {
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate unit ID: error %d\n", -ret);
+ goto bail;
+ }
+ /*
+ * Initialize all locks for the device. This needs to be as early as
+ * possible so locks are usable.
+ */
+ spin_lock_init(&dd->sc_lock);
+ spin_lock_init(&dd->sendctrl_lock);
+ spin_lock_init(&dd->rcvctrl_lock);
+ spin_lock_init(&dd->uctxt_lock);
+ spin_lock_init(&dd->hfi1_diag_trans_lock);
+ spin_lock_init(&dd->sc_init_lock);
+ spin_lock_init(&dd->dc8051_lock);
+ spin_lock_init(&dd->dc8051_memlock);
+ seqlock_init(&dd->sc2vl_lock);
+ spin_lock_init(&dd->sde_map_lock);
+ spin_lock_init(&dd->pio_map_lock);
+ init_waitqueue_head(&dd->event_queue);
+
+ dd->int_counter = alloc_percpu(u64);
+ if (!dd->int_counter) {
+ ret = -ENOMEM;
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate per-cpu int_counter\n");
+ goto bail;
+ }
+
+ dd->rcv_limit = alloc_percpu(u64);
+ if (!dd->rcv_limit) {
+ ret = -ENOMEM;
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate per-cpu rcv_limit\n");
+ goto bail;
+ }
+
+ dd->send_schedule = alloc_percpu(u64);
+ if (!dd->send_schedule) {
+ ret = -ENOMEM;
+ hfi1_early_err(&pdev->dev,
+ "Could not allocate per-cpu int_counter\n");
+ goto bail;
+ }
+
+ if (!hfi1_cpulist_count) {
+ u32 count = num_online_cpus();
+
+ hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long),
+ GFP_KERNEL);
+ if (hfi1_cpulist)
+ hfi1_cpulist_count = count;
+ else
+ hfi1_early_err(
+ &pdev->dev,
+ "Could not alloc cpulist info, cpu affinity might be wrong\n");
+ }
+ kobject_init(&dd->kobj, &hfi1_devdata_type);
+ return dd;
+
+bail:
+ if (!list_empty(&dd->list))
+ list_del_init(&dd->list);
+ rvt_dealloc_device(&dd->verbs_dev.rdi);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code. Should be paranoid about state of
+ * system and data structures.
+ */
+void hfi1_disable_after_error(struct hfi1_devdata *dd)
+{
+ if (dd->flags & HFI1_INITTED) {
+ u32 pidx;
+
+ dd->flags &= ~HFI1_INITTED;
+ if (dd->pport)
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ struct hfi1_pportdata *ppd;
+
+ ppd = dd->pport + pidx;
+ if (dd->flags & HFI1_PRESENT)
+ set_link_state(ppd, HLS_DN_DISABLE);
+
+ if (ppd->statusp)
+ *ppd->statusp &= ~HFI1_STATUS_IB_READY;
+ }
+ }
+
+ /*
+ * Mark as having had an error for driver, and also
+ * for /sys and status word mapped to user programs.
+ * This marks unit as not usable, until reset.
+ */
+ if (dd->status)
+ dd->status->dev |= HFI1_STATUS_HWERROR;
+}
+
+static void remove_one(struct pci_dev *);
+static int init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
+#define PFX DRIVER_NAME ": "
+
+const struct pci_device_id hfi1_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
+
+static struct pci_driver hfi1_pci_driver = {
+ .name = DRIVER_NAME,
+ .probe = init_one,
+ .remove = remove_one,
+ .id_table = hfi1_pci_tbl,
+ .err_handler = &hfi1_pci_err_handler,
+};
+
+static void __init compute_krcvqs(void)
+{
+ int i;
+
+ for (i = 0; i < krcvqsset; i++)
+ n_krcvqs += krcvqs[i];
+}
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init hfi1_mod_init(void)
+{
+ int ret;
+
+ ret = dev_init();
+ if (ret)
+ goto bail;
+
+ ret = node_affinity_init();
+ if (ret)
+ goto bail;
+
+ /* validate max MTU before any devices start */
+ if (!valid_opa_max_mtu(hfi1_max_mtu)) {
+ pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
+ hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
+ hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+ }
+ /* valid CUs run from 1-128 in powers of 2 */
+ if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
+ hfi1_cu = 1;
+ /* valid credit return threshold is 0-100, variable is unsigned */
+ if (user_credit_return_threshold > 100)
+ user_credit_return_threshold = 100;
+
+ compute_krcvqs();
+ /*
+ * sanitize receive interrupt count, time must wait until after
+ * the hardware type is known
+ */
+ if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
+ rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
+ /* reject invalid combinations */
+ if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
+ pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
+ rcv_intr_count = 1;
+ }
+ if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
+ /*
+ * Avoid indefinite packet delivery by requiring a timeout
+ * if count is > 1.
+ */
+ pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
+ rcv_intr_timeout = 1;
+ }
+ if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
+ /*
+ * The dynamic algorithm expects a non-zero timeout
+ * and a count > 1.
+ */
+ pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
+ rcv_intr_dynamic = 0;
+ }
+
+ /* sanitize link CRC options */
+ link_crc_mask &= SUPPORTED_CRCS;
+
+ /*
+ * These must be called before the driver is registered with
+ * the PCI subsystem.
+ */
+ idr_init(&hfi1_unit_table);
+
+ hfi1_dbg_init();
+ ret = hfi1_wss_init();
+ if (ret < 0)
+ goto bail_wss;
+ ret = pci_register_driver(&hfi1_pci_driver);
+ if (ret < 0) {
+ pr_err("Unable to register driver: error %d\n", -ret);
+ goto bail_dev;
+ }
+ goto bail; /* all OK */
+
+bail_dev:
+ hfi1_wss_exit();
+bail_wss:
+ hfi1_dbg_exit();
+ idr_destroy(&hfi1_unit_table);
+ dev_cleanup();
+bail:
+ return ret;
+}
+
+module_init(hfi1_mod_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit hfi1_mod_cleanup(void)
+{
+ pci_unregister_driver(&hfi1_pci_driver);
+ node_affinity_destroy();
+ hfi1_wss_exit();
+ hfi1_dbg_exit();
+ hfi1_cpulist_count = 0;
+ kfree(hfi1_cpulist);
+
+ idr_destroy(&hfi1_unit_table);
+ dispose_firmware(); /* asymmetric with obtain_firmware() */
+ dev_cleanup();
+}
+
+module_exit(hfi1_mod_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct hfi1_devdata *dd)
+{
+ int ctxt;
+ int pidx;
+ struct hfi1_ctxtdata **tmp;
+ unsigned long flags;
+
+ /* users can't do anything more with chip */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ struct hfi1_pportdata *ppd = &dd->pport[pidx];
+ struct cc_state *cc_state;
+ int i;
+
+ if (ppd->statusp)
+ *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
+
+ for (i = 0; i < OPA_MAX_SLS; i++)
+ hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
+
+ spin_lock(&ppd->cc_state_lock);
+ cc_state = get_cc_state_protected(ppd);
+ RCU_INIT_POINTER(ppd->cc_state, NULL);
+ spin_unlock(&ppd->cc_state_lock);
+
+ if (cc_state)
+ kfree_rcu(cc_state, rcu);
+ }
+
+ free_credit_return(dd);
+
+ /*
+ * Free any resources still in use (usually just kernel contexts)
+ * at unload; we do for ctxtcnt, because that's what we allocate.
+ * We acquire lock to be really paranoid that rcd isn't being
+ * accessed from some interrupt-related code (that should not happen,
+ * but best to be sure).
+ */
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ tmp = dd->rcd;
+ dd->rcd = NULL;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ if (dd->rcvhdrtail_dummy_kvaddr) {
+ dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
+ (void *)dd->rcvhdrtail_dummy_kvaddr,
+ dd->rcvhdrtail_dummy_physaddr);
+ dd->rcvhdrtail_dummy_kvaddr = NULL;
+ }
+
+ for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
+ struct hfi1_ctxtdata *rcd = tmp[ctxt];
+
+ tmp[ctxt] = NULL; /* debugging paranoia */
+ if (rcd) {
+ hfi1_clear_tids(rcd);
+ hfi1_free_ctxtdata(dd, rcd);
+ }
+ }
+ kfree(tmp);
+ free_pio_map(dd);
+ /* must follow rcv context free - need to remove rcv's hooks */
+ for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
+ sc_free(dd->send_contexts[ctxt].sc);
+ dd->num_send_contexts = 0;
+ kfree(dd->send_contexts);
+ dd->send_contexts = NULL;
+ kfree(dd->hw_to_sw);
+ dd->hw_to_sw = NULL;
+ kfree(dd->boardname);
+ vfree(dd->events);
+ vfree(dd->status);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void postinit_cleanup(struct hfi1_devdata *dd)
+{
+ hfi1_start_cleanup(dd);
+
+ hfi1_pcie_ddcleanup(dd);
+ hfi1_pcie_cleanup(dd->pcidev);
+
+ cleanup_device_data(dd);
+
+ hfi1_free_devdata(dd);
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret = 0, j, pidx, initfail;
+ struct hfi1_devdata *dd = ERR_PTR(-EINVAL);
+ struct hfi1_pportdata *ppd;
+
+ /* First, lock the non-writable module parameters */
+ HFI1_CAP_LOCK();
+
+ /* Validate some global module parameters */
+ if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+ hfi1_early_err(&pdev->dev, "Header queue count too small\n");
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
+ hfi1_early_err(&pdev->dev,
+ "Receive header queue count cannot be greater than %u\n",
+ HFI1_MAX_HDRQ_EGRBUF_CNT);
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* use the encoding function as a sanitization check */
+ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
+ hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
+ hfi1_hdrq_entsize);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* The receive eager buffer size must be set before the receive
+ * contexts are created.
+ *
+ * Set the eager buffer size. Validate that it falls in a range
+ * allowed by the hardware - all powers of 2 between the min and
+ * max. The maximum valid MTU is within the eager buffer range
+ * so we do not need to cap the max_mtu by an eager buffer size
+ * setting.
+ */
+ if (eager_buffer_size) {
+ if (!is_power_of_2(eager_buffer_size))
+ eager_buffer_size =
+ roundup_pow_of_two(eager_buffer_size);
+ eager_buffer_size =
+ clamp_val(eager_buffer_size,
+ MIN_EAGER_BUFFER * 8,
+ MAX_EAGER_BUFFER_TOTAL);
+ hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
+ eager_buffer_size);
+ } else {
+ hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* restrict value of hfi1_rcvarr_split */
+ hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
+
+ ret = hfi1_pcie_init(pdev, ent);
+ if (ret)
+ goto bail;
+
+ /*
+ * Do device-specific initialization, function table setup, dd
+ * allocation, etc.
+ */
+ switch (ent->device) {
+ case PCI_DEVICE_ID_INTEL0:
+ case PCI_DEVICE_ID_INTEL1:
+ dd = hfi1_init_dd(pdev, ent);
+ break;
+ default:
+ hfi1_early_err(&pdev->dev,
+ "Failing on unknown Intel deviceid 0x%x\n",
+ ent->device);
+ ret = -ENODEV;
+ }
+
+ if (IS_ERR(dd))
+ ret = PTR_ERR(dd);
+ if (ret)
+ goto clean_bail; /* error already printed */
+
+ ret = create_workqueues(dd);
+ if (ret)
+ goto clean_bail;
+
+ /* do the generic initialization */
+ initfail = hfi1_init(dd, 0);
+
+ ret = hfi1_register_ib_device(dd);
+
+ /*
+ * Now ready for use. this should be cleared whenever we
+ * detect a reset, or initiate one. If earlier failure,
+ * we still create devices, so diags, etc. can be used
+ * to determine cause of problem.
+ */
+ if (!initfail && !ret) {
+ dd->flags |= HFI1_INITTED;
+ /* create debufs files after init and ib register */
+ hfi1_dbg_ibdev_init(&dd->verbs_dev);
+ }
+
+ j = hfi1_device_create(dd);
+ if (j)
+ dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+
+ if (initfail || ret) {
+ stop_timers(dd);
+ flush_workqueue(ib_wq);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ hfi1_quiet_serdes(dd->pport + pidx);
+ ppd = dd->pport + pidx;
+ if (ppd->hfi1_wq) {
+ destroy_workqueue(ppd->hfi1_wq);
+ ppd->hfi1_wq = NULL;
+ }
+ }
+ if (!j)
+ hfi1_device_remove(dd);
+ if (!ret)
+ hfi1_unregister_ib_device(dd);
+ postinit_cleanup(dd);
+ if (initfail)
+ ret = initfail;
+ goto bail; /* everything already cleaned */
+ }
+
+ sdma_start(dd);
+
+ return 0;
+
+clean_bail:
+ hfi1_pcie_cleanup(pdev);
+bail:
+ return ret;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ /* close debugfs files before ib unregister */
+ hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+ /* unregister from IB core */
+ hfi1_unregister_ib_device(dd);
+
+ /*
+ * Disable the IB link, disable interrupts on the device,
+ * clear dma engines, etc.
+ */
+ shutdown_device(dd);
+
+ stop_timers(dd);
+
+ /* wait until all of our (qsfp) queue_work() calls complete */
+ flush_workqueue(ib_wq);
+
+ hfi1_device_remove(dd);
+
+ postinit_cleanup(dd);
+}
+
+/**
+ * hfi1_create_rcvhdrq - create a receive header queue
+ * @dd: the hfi1_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+ unsigned amt;
+ u64 reg;
+
+ if (!rcd->rcvhdrq) {
+ dma_addr_t phys_hdrqtail;
+ gfp_t gfp_flags;
+
+ /*
+ * rcvhdrqentsize is in DWs, so we have to convert to bytes
+ * (* sizeof(u32)).
+ */
+ amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
+ sizeof(u32));
+
+ gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+ GFP_USER : GFP_KERNEL;
+ rcd->rcvhdrq = dma_zalloc_coherent(
+ &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+ gfp_flags | __GFP_COMP);
+
+ if (!rcd->rcvhdrq) {
+ dd_dev_err(dd,
+ "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+ amt, rcd->ctxt);
+ goto bail;
+ }
+
+ if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+ rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+ gfp_flags);
+ if (!rcd->rcvhdrtail_kvaddr)
+ goto bail_free;
+ rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+ }
+
+ rcd->rcvhdrq_size = amt;
+ }
+ /*
+ * These values are per-context:
+ * RcvHdrCnt
+ * RcvHdrEntSize
+ * RcvHdrSize
+ */
+ reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
+ & RCV_HDR_CNT_CNT_MASK)
+ << RCV_HDR_CNT_CNT_SHIFT;
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
+ reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
+ & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
+ << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
+ reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
+ << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
+
+ /*
+ * Program dummy tail address for every receive context
+ * before enabling any receive context
+ */
+ write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
+ dd->rcvhdrtail_dummy_physaddr);
+
+ return 0;
+
+bail_free:
+ dd_dev_err(dd,
+ "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+ rcd->ctxt);
+ vfree(rcd->user_event_mask);
+ rcd->user_event_mask = NULL;
+ dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+ rcd->rcvhdrq_phys);
+ rcd->rcvhdrq = NULL;
+bail:
+ return -ENOMEM;
+}
+
+/**
+ * allocate eager buffers, both kernel and user contexts.
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into hip.
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls. Otherwise we get the OOM code involved, by asking for too
+ * much per call, with disastrous results on some kernels.
+ */
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
+ gfp_t gfp_flags;
+ u16 order;
+ int ret = 0;
+ u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
+
+ /*
+ * GFP_USER, but without GFP_FS, so buffer cache can be
+ * coalesced (we hope); otherwise, even at order 4,
+ * heavy filesystem activity makes these fail, and we can
+ * use compound pages.
+ */
+ gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;
+
+ /*
+ * The minimum size of the eager buffers is a groups of MTU-sized
+ * buffers.
+ * The global eager_buffer_size parameter is checked against the
+ * theoretical lower limit of the value. Here, we check against the
+ * MTU.
+ */
+ if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
+ rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
+ /*
+ * If using one-pkt-per-egr-buffer, lower the eager buffer
+ * size to the max MTU (page-aligned).
+ */
+ if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+ rcd->egrbufs.rcvtid_size = round_mtu;
+
+ /*
+ * Eager buffers sizes of 1MB or less require smaller TID sizes
+ * to satisfy the "multiple of 8 RcvArray entries" requirement.
+ */
+ if (rcd->egrbufs.size <= (1 << 20))
+ rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
+ rounddown_pow_of_two(rcd->egrbufs.size / 8));
+
+ while (alloced_bytes < rcd->egrbufs.size &&
+ rcd->egrbufs.alloced < rcd->egrbufs.count) {
+ rcd->egrbufs.buffers[idx].addr =
+ dma_zalloc_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.rcvtid_size,
+ &rcd->egrbufs.buffers[idx].phys,
+ gfp_flags);
+ if (rcd->egrbufs.buffers[idx].addr) {
+ rcd->egrbufs.buffers[idx].len =
+ rcd->egrbufs.rcvtid_size;
+ rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
+ rcd->egrbufs.buffers[idx].addr;
+ rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
+ rcd->egrbufs.buffers[idx].phys;
+ rcd->egrbufs.alloced++;
+ alloced_bytes += rcd->egrbufs.rcvtid_size;
+ idx++;
+ } else {
+ u32 new_size, i, j;
+ u64 offset = 0;
+
+ /*
+ * Fail the eager buffer allocation if:
+ * - we are already using the lowest acceptable size
+ * - we are using one-pkt-per-egr-buffer (this implies
+ * that we are accepting only one size)
+ */
+ if (rcd->egrbufs.rcvtid_size == round_mtu ||
+ !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
+ dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
+ rcd->ctxt);
+ goto bail_rcvegrbuf_phys;
+ }
+
+ new_size = rcd->egrbufs.rcvtid_size / 2;
+
+ /*
+ * If the first attempt to allocate memory failed, don't
+ * fail everything but continue with the next lower
+ * size.
+ */
+ if (idx == 0) {
+ rcd->egrbufs.rcvtid_size = new_size;
+ continue;
+ }
+
+ /*
+ * Re-partition already allocated buffers to a smaller
+ * size.
+ */
+ rcd->egrbufs.alloced = 0;
+ for (i = 0, j = 0, offset = 0; j < idx; i++) {
+ if (i >= rcd->egrbufs.count)
+ break;
+ rcd->egrbufs.rcvtids[i].phys =
+ rcd->egrbufs.buffers[j].phys + offset;
+ rcd->egrbufs.rcvtids[i].addr =
+ rcd->egrbufs.buffers[j].addr + offset;
+ rcd->egrbufs.alloced++;
+ if ((rcd->egrbufs.buffers[j].phys + offset +
+ new_size) ==
+ (rcd->egrbufs.buffers[j].phys +
+ rcd->egrbufs.buffers[j].len)) {
+ j++;
+ offset = 0;
+ } else {
+ offset += new_size;
+ }
+ }
+ rcd->egrbufs.rcvtid_size = new_size;
+ }
+ }
+ rcd->egrbufs.numbufs = idx;
+ rcd->egrbufs.size = alloced_bytes;
+
+ hfi1_cdbg(PROC,
+ "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+ rcd->ctxt, rcd->egrbufs.alloced,
+ rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
+
+ /*
+ * Set the contexts rcv array head update threshold to the closest
+ * power of 2 (so we can use a mask instead of modulo) below half
+ * the allocated entries.
+ */
+ rcd->egrbufs.threshold =
+ rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
+ /*
+ * Compute the expected RcvArray entry base. This is done after
+ * allocating the eager buffers in order to maximize the
+ * expected RcvArray entries for the context.
+ */
+ max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
+ egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
+ rcd->expected_count = max_entries - egrtop;
+ if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
+ rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
+
+ rcd->expected_base = rcd->eager_base + egrtop;
+ hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
+ rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
+ rcd->eager_base, rcd->expected_base);
+
+ if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
+ hfi1_cdbg(PROC,
+ "ctxt%u: current Eager buffer size is invalid %u\n",
+ rcd->ctxt, rcd->egrbufs.rcvtid_size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
+ hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
+ rcd->egrbufs.rcvtids[idx].phys, order);
+ cond_resched();
+ }
+ goto bail;
+
+bail_rcvegrbuf_phys:
+ for (idx = 0; idx < rcd->egrbufs.alloced &&
+ rcd->egrbufs.buffers[idx].addr;
+ idx++) {
+ dma_free_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.buffers[idx].len,
+ rcd->egrbufs.buffers[idx].addr,
+ rcd->egrbufs.buffers[idx].phys);
+ rcd->egrbufs.buffers[idx].addr = NULL;
+ rcd->egrbufs.buffers[idx].phys = 0;
+ rcd->egrbufs.buffers[idx].len = 0;
+ }
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c
new file mode 100644
index 000000000000..65348d16ab2f
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/intr.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "sdma.h"
+
+/**
+ * format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+ strlcat(msg, "[", msgl);
+ strlcat(msg, hwmsg, msgl);
+ strlcat(msg, "]", msgl);
+}
+
+/**
+ * hfi1_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+ int i;
+
+ for (i = 0; i < nhwerrmsgs; i++)
+ if (hwerrs & hwerrmsgs[i].mask)
+ format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
+
+static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
+{
+ struct ib_event event;
+ struct hfi1_devdata *dd = ppd->dd;
+
+ /*
+ * Only call ib_dispatch_event() if the IB device has been
+ * registered. HFI1_INITED is set iff the driver has successfully
+ * registered with the IB core.
+ */
+ if (!(dd->flags & HFI1_INITTED))
+ return;
+ event.device = &dd->verbs_dev.rdi.ibdev;
+ event.element.port_num = ppd->port;
+ event.event = ev;
+ ib_dispatch_event(&event);
+}
+
+/*
+ * Handle a linkup or link down notification.
+ * This is called outside an interrupt.
+ */
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
+{
+ struct hfi1_pportdata *ppd = &dd->pport[0];
+ enum ib_event_type ev;
+
+ if (!(ppd->linkup ^ !!linkup))
+ return; /* no change, nothing to do */
+
+ if (linkup) {
+ /*
+ * Quick linkup and all link up on the simulator does not
+ * trigger or implement:
+ * - VerifyCap interrupt
+ * - VerifyCap frames
+ * But rather moves directly to LinkUp.
+ *
+ * Do the work of the VerifyCap interrupt handler,
+ * handle_verify_cap(), but do not try moving the state to
+ * LinkUp as we are already there.
+ *
+ * NOTE: This uses this device's vAU, vCU, and vl15_init for
+ * the remote values. Both sides must be using the values.
+ */
+ if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ set_up_vl15(dd, dd->vau, dd->vl15_init);
+ assign_remote_cm_au_table(dd, dd->vcu);
+ ppd->neighbor_guid =
+ read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+ ppd->neighbor_type =
+ read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+ DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+ ppd->neighbor_port_number =
+ read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+ DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+ dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n",
+ ppd->neighbor_guid,
+ ppd->neighbor_type);
+ }
+
+ /* physical link went up */
+ ppd->linkup = 1;
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+
+ /* link widths are not available until the link is fully up */
+ get_linkup_link_widths(ppd);
+
+ } else {
+ /* physical link went down */
+ ppd->linkup = 0;
+
+ /* clear HW details of the previous connection */
+ reset_link_credits(dd);
+
+ /* freeze after a link down to guarantee a clean egress */
+ start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN);
+
+ ev = IB_EVENT_PORT_ERR;
+
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
+
+ /* if we are down, the neighbor is down */
+ ppd->neighbor_normal = 0;
+
+ /* notify IB of the link change */
+ signal_ib_event(ppd, ev);
+ }
+}
+
+/*
+ * Handle receive or urgent interrupts for user contexts. This means a user
+ * process was waiting for a packet to arrive, and didn't want to poll.
+ */
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_devdata *dd = rcd->dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ if (!rcd->cnt)
+ goto done;
+
+ if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
+ wake_up_interruptible(&rcd->wait);
+ hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
+ } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
+ &rcd->event_flags)) {
+ rcd->urgent++;
+ wake_up_interruptible(&rcd->wait);
+ }
+done:
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
new file mode 100644
index 000000000000..2ec6ef38d389
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -0,0 +1,300 @@
+#ifndef _HFI1_IOWAIT_H
+#define _HFI1_IOWAIT_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+#include "sdma_txreq.h"
+
+/*
+ * typedef (*restart_t)() - restart callback
+ * @work: pointer to work structure
+ */
+typedef void (*restart_t)(struct work_struct *work);
+
+struct sdma_txreq;
+struct sdma_engine;
+/**
+ * struct iowait - linkage for delayed progress/waiting
+ * @list: used to add/insert into QP/PQ wait lists
+ * @tx_head: overflow list of sdma_txreq's
+ * @sleep: no space callback
+ * @wakeup: space callback wakeup
+ * @sdma_drained: sdma count drained
+ * @iowork: workqueue overhead
+ * @wait_dma: wait for sdma_busy == 0
+ * @wait_pio: wait for pio_busy == 0
+ * @sdma_busy: # of packets in flight
+ * @count: total number of descriptors in tx_head'ed list
+ * @tx_limit: limit for overflow queuing
+ * @tx_count: number of tx entry's in tx_head'ed list
+ *
+ * This is to be embedded in user's state structure
+ * (QP or PQ).
+ *
+ * The sleep and wakeup members are a
+ * bit misnamed. They do not strictly
+ * speaking sleep or wake up, but they
+ * are callbacks for the ULP to implement
+ * what ever queuing/dequeuing of
+ * the embedded iowait and its containing struct
+ * when a resource shortage like SDMA ring space is seen.
+ *
+ * Both potentially have locks help
+ * so sleeping is not allowed.
+ *
+ * The wait_dma member along with the iow
+ */
+
+struct iowait {
+ struct list_head list;
+ struct list_head tx_head;
+ int (*sleep)(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx,
+ unsigned seq);
+ void (*wakeup)(struct iowait *wait, int reason);
+ void (*sdma_drained)(struct iowait *wait);
+ struct work_struct iowork;
+ wait_queue_head_t wait_dma;
+ wait_queue_head_t wait_pio;
+ atomic_t sdma_busy;
+ atomic_t pio_busy;
+ u32 count;
+ u32 tx_limit;
+ u32 tx_count;
+};
+
+#define SDMA_AVAIL_REASON 0
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @resume: wakeup function for no space
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+
+static inline void iowait_init(
+ struct iowait *wait,
+ u32 tx_limit,
+ void (*func)(struct work_struct *work),
+ int (*sleep)(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx,
+ unsigned seq),
+ void (*wakeup)(struct iowait *wait, int reason),
+ void (*sdma_drained)(struct iowait *wait))
+{
+ wait->count = 0;
+ INIT_LIST_HEAD(&wait->list);
+ INIT_LIST_HEAD(&wait->tx_head);
+ INIT_WORK(&wait->iowork, func);
+ init_waitqueue_head(&wait->wait_dma);
+ init_waitqueue_head(&wait->wait_pio);
+ atomic_set(&wait->sdma_busy, 0);
+ atomic_set(&wait->pio_busy, 0);
+ wait->tx_limit = tx_limit;
+ wait->sleep = sleep;
+ wait->wakeup = wakeup;
+ wait->sdma_drained = sdma_drained;
+}
+
+/**
+ * iowait_schedule() - initialize wait structure
+ * @wait: wait struct to schedule
+ * @wq: workqueue for schedule
+ * @cpu: cpu
+ */
+static inline void iowait_schedule(
+ struct iowait *wait,
+ struct workqueue_struct *wq,
+ int cpu)
+{
+ queue_work_on(cpu, wq, &wait->iowork);
+}
+
+/**
+ * iowait_sdma_drain() - wait for DMAs to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait sdmas have
+ * completed.
+ */
+static inline void iowait_sdma_drain(struct iowait *wait)
+{
+ wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
+}
+
+/**
+ * iowait_sdma_pending() - return sdma pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_sdma_pending(struct iowait *wait)
+{
+ return atomic_read(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_inc - note sdma io pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_inc(struct iowait *wait)
+{
+ atomic_inc(&wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_add - add count to pending
+ * @wait: iowait structure
+ */
+static inline void iowait_sdma_add(struct iowait *wait, int count)
+{
+ atomic_add(count, &wait->sdma_busy);
+}
+
+/**
+ * iowait_sdma_dec - note sdma complete
+ * @wait: iowait structure
+ */
+static inline int iowait_sdma_dec(struct iowait *wait)
+{
+ return atomic_dec_and_test(&wait->sdma_busy);
+}
+
+/**
+ * iowait_pio_drain() - wait for pios to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait pios have
+ * completed.
+ */
+static inline void iowait_pio_drain(struct iowait *wait)
+{
+ wait_event_timeout(wait->wait_pio,
+ !atomic_read(&wait->pio_busy),
+ HZ);
+}
+
+/**
+ * iowait_pio_pending() - return pio pending count
+ *
+ * @wait: iowait structure
+ *
+ */
+static inline int iowait_pio_pending(struct iowait *wait)
+{
+ return atomic_read(&wait->pio_busy);
+}
+
+/**
+ * iowait_pio_inc - note pio pending
+ * @wait: iowait structure
+ */
+static inline void iowait_pio_inc(struct iowait *wait)
+{
+ atomic_inc(&wait->pio_busy);
+}
+
+/**
+ * iowait_sdma_dec - note pio complete
+ * @wait: iowait structure
+ */
+static inline int iowait_pio_dec(struct iowait *wait)
+{
+ return atomic_dec_and_test(&wait->pio_busy);
+}
+
+/**
+ * iowait_drain_wakeup() - trigger iowait_drain() waiter
+ *
+ * @wait: iowait structure
+ *
+ * This will trigger any waiters.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+ wake_up(&wait->wait_dma);
+ wake_up(&wait->wait_pio);
+ if (wait->sdma_drained)
+ wait->sdma_drained(wait);
+}
+
+/**
+ * iowait_get_txhead() - get packet off of iowait list
+ *
+ * @wait wait struture
+ */
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+{
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&wait->tx_head)) {
+ tx = list_first_entry(
+ &wait->tx_head,
+ struct sdma_txreq,
+ list);
+ list_del_init(&tx->list);
+ }
+ return tx;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
new file mode 100644
index 000000000000..7ffc14f21523
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -0,0 +1,4434 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
+ / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+#include "qp.h"
+
+/* the reset value from the FM is supposed to be 0xffff, handle both */
+#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
+#define OPA_LINK_WIDTH_RESET 0xffff
+
+static int reply(struct ib_mad_hdr *smp)
+{
+ /*
+ * The verbs framework will handle the directed/LID route
+ * packet changes.
+ */
+ smp->method = IB_MGMT_METHOD_GET_RESP;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ smp->status |= IB_SMP_DIRECTION;
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static inline void clear_opa_smp_data(struct opa_smp *smp)
+{
+ void *data = opa_get_smp_data(smp);
+ size_t size = opa_get_smp_data_size(smp);
+
+ memset(data, 0, size);
+}
+
+void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port)
+{
+ struct ib_event event;
+
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.device = &dd->verbs_dev.rdi.ibdev;
+ event.element.port_num = port;
+ ib_dispatch_event(&event);
+}
+
+static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+{
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent;
+ struct opa_smp *smp;
+ int ret;
+ unsigned long flags;
+ unsigned long timeout;
+ int pkey_idx;
+ u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
+
+ agent = ibp->rvp.send_agent;
+ if (!agent)
+ return;
+
+ /* o14-3.2.1 */
+ if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+ return;
+
+ /* o14-2 */
+ if (ibp->rvp.trap_timeout && time_before(jiffies,
+ ibp->rvp.trap_timeout))
+ return;
+
+ pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+ if (pkey_idx < 0) {
+ pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
+ __func__, hfi1_get_pkey(ibp, 1));
+ pkey_idx = 1;
+ }
+
+ send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
+ IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+ if (IS_ERR(send_buf))
+ return;
+
+ smp = send_buf->mad;
+ smp->base_version = OPA_MGMT_BASE_VERSION;
+ smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ smp->class_version = OPA_SMI_CLASS_VERSION;
+ smp->method = IB_MGMT_METHOD_TRAP;
+ ibp->rvp.tid++;
+ smp->tid = cpu_to_be64(ibp->rvp.tid);
+ smp->attr_id = IB_SMP_ATTR_NOTICE;
+ /* o14-1: smp->mkey = 0; */
+ memcpy(smp->route.lid.data, data, len);
+
+ spin_lock_irqsave(&ibp->rvp.lock, flags);
+ if (!ibp->rvp.sm_ah) {
+ if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+ struct ib_ah *ah;
+
+ ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ } else {
+ send_buf->ah = ah;
+ ibp->rvp.sm_ah = ibah_to_rvtah(ah);
+ ret = 0;
+ }
+ } else {
+ ret = -EINVAL;
+ }
+ } else {
+ send_buf->ah = &ibp->rvp.sm_ah->ibah;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+
+ if (!ret)
+ ret = ib_post_send_mad(send_buf, NULL);
+ if (!ret) {
+ /* 4.096 usec. */
+ timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000;
+ ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout);
+ } else {
+ ib_free_send_mad(send_buf);
+ ibp->rvp.trap_timeout = 0;
+ }
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+ u32 qp1, u32 qp2, u16 lid1, u16 lid2)
+{
+ struct opa_mad_notice_attr data;
+ u32 lid = ppd_from_ibp(ibp)->lid;
+ u32 _lid1 = lid1;
+ u32 _lid2 = lid2;
+
+ memset(&data, 0, sizeof(data));
+
+ if (trap_num == OPA_TRAP_BAD_P_KEY)
+ ibp->rvp.pkey_violations++;
+ else
+ ibp->rvp.qkey_violations++;
+ ibp->rvp.n_pkt_drops++;
+
+ /* Send violation trap */
+ data.generic_type = IB_NOTICE_TYPE_SECURITY;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = trap_num;
+ data.issuer_lid = cpu_to_be32(lid);
+ data.ntc_257_258.lid1 = cpu_to_be32(_lid1);
+ data.ntc_257_258.lid2 = cpu_to_be32(_lid2);
+ data.ntc_257_258.key = cpu_to_be32(key);
+ data.ntc_257_258.sl = sl << 3;
+ data.ntc_257_258.qp1 = cpu_to_be32(qp1);
+ data.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+ __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
+{
+ struct opa_mad_notice_attr data;
+ u32 lid = ppd_from_ibp(ibp)->lid;
+
+ memset(&data, 0, sizeof(data));
+ /* Send violation trap */
+ data.generic_type = IB_NOTICE_TYPE_SECURITY;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = OPA_TRAP_BAD_M_KEY;
+ data.issuer_lid = cpu_to_be32(lid);
+ data.ntc_256.lid = data.issuer_lid;
+ data.ntc_256.method = mad->method;
+ data.ntc_256.attr_id = mad->attr_id;
+ data.ntc_256.attr_mod = mad->attr_mod;
+ data.ntc_256.mkey = mkey;
+ if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ data.ntc_256.dr_slid = dr_slid;
+ data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+ if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) {
+ data.ntc_256.dr_trunc_hop |=
+ IB_NOTICE_TRAP_DR_TRUNC;
+ hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path);
+ }
+ data.ntc_256.dr_trunc_hop |= hop_cnt;
+ memcpy(data.ntc_256.dr_rtn_path, return_path,
+ hop_cnt);
+ }
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num)
+{
+ struct opa_mad_notice_attr data;
+ struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+ struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+ struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data;
+ u32 lid = ppd_from_ibp(ibp)->lid;
+
+ memset(&data, 0, sizeof(data));
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
+ data.issuer_lid = cpu_to_be32(lid);
+ data.ntc_144.lid = data.issuer_lid;
+ data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
+{
+ struct opa_mad_notice_attr data;
+ u32 lid = ppd_from_ibp(ibp)->lid;
+
+ memset(&data, 0, sizeof(data));
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = OPA_TRAP_CHANGE_SYSGUID;
+ data.issuer_lid = cpu_to_be32(lid);
+ data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+ data.ntc_145.lid = data.issuer_lid;
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
+{
+ struct opa_mad_notice_attr data;
+ u32 lid = ppd_from_ibp(ibp)->lid;
+
+ memset(&data, 0, sizeof(data));
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = OPA_TRAP_CHANGE_CAPABILITY;
+ data.issuer_lid = cpu_to_be32(lid);
+ data.ntc_144.lid = data.issuer_lid;
+ data.ntc_144.change_flags =
+ cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG);
+
+ send_trap(ibp, &data, sizeof(data));
+}
+
+static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct opa_node_description *nd;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ nd = (struct opa_node_description *)data;
+
+ memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
+
+ if (resp_len)
+ *resp_len += sizeof(*nd);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_node_info *ni;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+
+ ni = (struct opa_node_info *)data;
+
+ /* GUID 0 is illegal */
+ if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+ ni->base_version = OPA_MGMT_BASE_VERSION;
+ ni->class_version = OPA_SMI_CLASS_VERSION;
+ ni->node_type = 1; /* channel adapter */
+ ni->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ ni->system_image_guid = ib_hfi1_sys_image_guid;
+ /* Use first-port GUID as node */
+ ni->node_guid = cpu_to_be64(dd->pport->guid);
+ ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+ ni->device_id = cpu_to_be16(dd->pcidev->device);
+ ni->revision = cpu_to_be32(dd->minrev);
+ ni->local_port_num = port;
+ ni->vendor_id[0] = dd->oui1;
+ ni->vendor_id[1] = dd->oui2;
+ ni->vendor_id[2] = dd->oui3;
+
+ if (resp_len)
+ *resp_len += sizeof(*ni);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */
+
+ /* GUID 0 is illegal */
+ if (smp->attr_mod || pidx >= dd->num_pports ||
+ dd->pport[pidx].guid == 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+
+ nip->base_version = OPA_MGMT_BASE_VERSION;
+ nip->class_version = OPA_SMI_CLASS_VERSION;
+ nip->node_type = 1; /* channel adapter */
+ nip->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ nip->sys_guid = ib_hfi1_sys_image_guid;
+ /* Use first-port GUID as node */
+ nip->node_guid = cpu_to_be64(dd->pport->guid);
+ nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+ nip->device_id = cpu_to_be16(dd->pcidev->device);
+ nip->revision = cpu_to_be32(dd->minrev);
+ nip->local_port_num = port;
+ nip->vendor_id[0] = dd->oui1;
+ nip->vendor_id[1] = dd->oui2;
+ nip->vendor_id[2] = dd->oui3;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
+}
+
+static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
+{
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
+}
+
+static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+ int mad_flags, __be64 mkey, __be32 dr_slid,
+ u8 return_path[], u8 hop_cnt)
+{
+ int valid_mkey = 0;
+ int ret = 0;
+
+ /* Is the mkey in the process of expiring? */
+ if (ibp->rvp.mkey_lease_timeout &&
+ time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) {
+ /* Clear timeout and mkey protection field. */
+ ibp->rvp.mkey_lease_timeout = 0;
+ ibp->rvp.mkeyprot = 0;
+ }
+
+ if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 ||
+ ibp->rvp.mkey == mkey)
+ valid_mkey = 1;
+
+ /* Unset lease timeout on any valid Get/Set/TrapRepress */
+ if (valid_mkey && ibp->rvp.mkey_lease_timeout &&
+ (mad->method == IB_MGMT_METHOD_GET ||
+ mad->method == IB_MGMT_METHOD_SET ||
+ mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
+ ibp->rvp.mkey_lease_timeout = 0;
+
+ if (!valid_mkey) {
+ switch (mad->method) {
+ case IB_MGMT_METHOD_GET:
+ /* Bad mkey not a violation below level 2 */
+ if (ibp->rvp.mkeyprot < 2)
+ break;
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ if (ibp->rvp.mkey_violations != 0xFFFF)
+ ++ibp->rvp.mkey_violations;
+ if (!ibp->rvp.mkey_lease_timeout &&
+ ibp->rvp.mkey_lease_period)
+ ibp->rvp.mkey_lease_timeout = jiffies +
+ ibp->rvp.mkey_lease_period * HZ;
+ /* Generate a trap notice. */
+ bad_mkey(ibp, mad, mkey, dr_slid, return_path,
+ hop_cnt);
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * The SMA caches reads from LCB registers in case the LCB is unavailable.
+ * (The LCB is unavailable in certain link states, for example.)
+ */
+struct lcb_datum {
+ u32 off;
+ u64 val;
+};
+
+static struct lcb_datum lcb_cache[] = {
+ { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
+};
+
+static int write_lcb_cache(u32 off, u64 val)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+ if (lcb_cache[i].off == off) {
+ lcb_cache[i].val = val;
+ return 0;
+ }
+ }
+
+ pr_warn("%s bad offset 0x%x\n", __func__, off);
+ return -1;
+}
+
+static int read_lcb_cache(u32 off, u64 *val)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+ if (lcb_cache[i].off == off) {
+ *val = lcb_cache[i].val;
+ return 0;
+ }
+ }
+
+ pr_warn("%s bad offset 0x%x\n", __func__, off);
+ return -1;
+}
+
+void read_ltp_rtt(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
+ dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
+ else
+ write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
+}
+
+static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int i;
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ struct opa_port_info *pi = (struct opa_port_info *)data;
+ u8 mtu;
+ u8 credit_rate;
+ u8 is_beaconing_active;
+ u32 state;
+ u32 num_ports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ u32 buffer_units;
+ u64 tmp = 0;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ dd = dd_from_ibdev(ibdev);
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ ibp = &ppd->ibport_data;
+
+ if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+ ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ pi->lid = cpu_to_be32(ppd->lid);
+
+ /* Only return the mkey if the protection field allows it. */
+ if (!(smp->method == IB_MGMT_METHOD_GET &&
+ ibp->rvp.mkey != smp->mkey &&
+ ibp->rvp.mkeyprot == 1))
+ pi->mkey = ibp->rvp.mkey;
+
+ pi->subnet_prefix = ibp->rvp.gid_prefix;
+ pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid);
+ pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags);
+ pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period);
+ pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
+ pi->sa_qp = cpu_to_be32(ppd->sa_qp);
+
+ pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
+ pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
+ pi->link_width.active = cpu_to_be16(ppd->link_width_active);
+
+ pi->link_width_downgrade.supported =
+ cpu_to_be16(ppd->link_width_downgrade_supported);
+ pi->link_width_downgrade.enabled =
+ cpu_to_be16(ppd->link_width_downgrade_enabled);
+ pi->link_width_downgrade.tx_active =
+ cpu_to_be16(ppd->link_width_downgrade_tx_active);
+ pi->link_width_downgrade.rx_active =
+ cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+ pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
+ pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
+ pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
+
+ state = driver_lstate(ppd);
+
+ if (start_of_sm_config && (state == IB_PORT_INIT))
+ ppd->is_sm_config_started = 1;
+
+ pi->port_phys_conf = (ppd->port_type & 0xf);
+
+ pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+ pi->port_states.ledenable_offlinereason |=
+ ppd->is_sm_config_started << 5;
+ /*
+ * This pairs with the memory barrier in hfi1_start_led_override to
+ * ensure that we read the correct state of LED beaconing represented
+ * by led_override_timer_active
+ */
+ smp_rmb();
+ is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+ pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
+ pi->port_states.ledenable_offlinereason |=
+ ppd->offline_disabled_reason;
+
+ pi->port_states.portphysstate_portstate =
+ (hfi1_ibphys_portstate(ppd) << 4) | state;
+
+ pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc;
+
+ memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
+ for (i = 0; i < ppd->vls_supported; i++) {
+ mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
+ if ((i % 2) == 0)
+ pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4);
+ else
+ pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu;
+ }
+ /* don't forget VL 15 */
+ mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
+ pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu;
+ pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL;
+ pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
+ pi->partenforce_filterraw |=
+ (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
+ if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
+ pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
+ if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
+ pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
+ pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations);
+ /* P_KeyViolations are counted by hardware. */
+ pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations);
+ pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations);
+
+ pi->vl.cap = ppd->vls_supported;
+ pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit);
+ pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
+ pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
+
+ pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout;
+
+ pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
+ OPA_PORT_LINK_MODE_OPA << 5 |
+ OPA_PORT_LINK_MODE_OPA);
+
+ pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
+
+ pi->port_mode = cpu_to_be16(
+ ppd->is_active_optimize_enabled ?
+ OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
+
+ pi->port_packet_format.supported =
+ cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+ pi->port_packet_format.enabled =
+ cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+
+ /* flit_control.interleave is (OPA V1, version .76):
+ * bits use
+ * ---- ---
+ * 2 res
+ * 2 DistanceSupported
+ * 2 DistanceEnabled
+ * 5 MaxNextLevelTxEnabled
+ * 5 MaxNestLevelRxSupported
+ *
+ * HFI supports only "distance mode 1" (see OPA V1, version .76,
+ * section 9.6.2), so set DistanceSupported, DistanceEnabled
+ * to 0x1.
+ */
+ pi->flit_control.interleave = cpu_to_be16(0x1400);
+
+ pi->link_down_reason = ppd->local_link_down_reason.sma;
+ pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
+ pi->port_error_action = cpu_to_be32(ppd->port_error_action);
+ pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
+
+ /* 32.768 usec. response time (guessing) */
+ pi->resptimevalue = 3;
+
+ pi->local_port_num = port;
+
+ /* buffer info for FM */
+ pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
+
+ pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
+ pi->neigh_port_num = ppd->neighbor_port_number;
+ pi->port_neigh_mode =
+ (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
+ (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
+ (ppd->neighbor_fm_security ?
+ OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
+
+ /* HFIs shall always return VL15 credits to their
+ * neighbor in a timely manner, without any credit return pacing.
+ */
+ credit_rate = 0;
+ buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
+ buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
+ buffer_units |= (credit_rate << 6) &
+ OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
+ buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
+ pi->buffer_units = cpu_to_be32(buffer_units);
+
+ pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
+
+ /* HFI supports a replay buffer 128 LTPs in size */
+ pi->replay_depth.buffer = 0x80;
+ /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+ read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
+
+ /*
+ * this counter is 16 bits wide, but the replay_depth.wire
+ * variable is only 8 bits
+ */
+ if (tmp > 0xff)
+ tmp = 0xff;
+ pi->replay_depth.wire = tmp;
+
+ if (resp_len)
+ *resp_len += sizeof(struct opa_port_info);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+ struct hfi1_pportdata *ppd = dd->pport + port - 1;
+
+ memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
+
+ return 0;
+}
+
+static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 n_blocks_req = OPA_AM_NBLK(am);
+ u32 start_block = am & 0x7ff;
+ __be16 *p;
+ u16 *q;
+ int i;
+ u16 n_blocks_avail;
+ unsigned npkeys = hfi1_get_npkeys(dd);
+ size_t size;
+
+ if (n_blocks_req == 0) {
+ pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+ port, start_block, n_blocks_req);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+ size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
+
+ if (start_block + n_blocks_req > n_blocks_avail ||
+ n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+ pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
+ "avail 0x%x; blk/smp 0x%lx\n",
+ start_block, n_blocks_req, n_blocks_avail,
+ OPA_NUM_PKEY_BLOCKS_PER_SMP);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ p = (__be16 *)data;
+ q = (u16 *)data;
+ /* get the real pkeys if we are requesting the first block */
+ if (start_block == 0) {
+ get_pkeys(dd, port, q);
+ for (i = 0; i < npkeys; i++)
+ p[i] = cpu_to_be16(q[i]);
+ if (resp_len)
+ *resp_len += size;
+ } else {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+enum {
+ HFI_TRANSITION_DISALLOWED,
+ HFI_TRANSITION_IGNORED,
+ HFI_TRANSITION_ALLOWED,
+ HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * Use shortened names to improve readability of
+ * {logical,physical}_state_transitions
+ */
+enum {
+ __D = HFI_TRANSITION_DISALLOWED,
+ __I = HFI_TRANSITION_IGNORED,
+ __A = HFI_TRANSITION_ALLOWED,
+ __U = HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
+ * represented in physical_state_transitions.
+ */
+#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
+
+/*
+ * Within physical_state_transitions, rows represent "old" states,
+ * columns "new" states, and physical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 6-4).
+ */
+static const struct {
+ u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
+} physical_state_transitions = {
+ {
+ /* 2 3 4 5 6 7 8 9 10 11 */
+ /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
+ /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
+ /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
+ /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
+ /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
+ /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+ /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
+ }
+};
+
+/*
+ * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
+ * logical_state_transitions
+ */
+
+#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
+
+/*
+ * Within logical_state_transitions rows represent "old" states,
+ * columns "new" states, and logical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 9-12).
+ */
+static const struct {
+ u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
+} logical_state_transitions = {
+ {
+ /* 1 2 3 4 5 */
+ /* 1 */ { __I, __D, __D, __D, __U},
+ /* 2 */ { __D, __I, __A, __D, __U},
+ /* 3 */ { __D, __D, __I, __A, __U},
+ /* 4 */ { __D, __D, __I, __I, __U},
+ /* 5 */ { __U, __U, __U, __U, __U},
+ }
+};
+
+static int logical_transition_allowed(int old, int new)
+{
+ if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
+ new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
+ pr_warn("invalid logical state(s) (old %d new %d)\n",
+ old, new);
+ return HFI_TRANSITION_UNDEFINED;
+ }
+
+ if (new == IB_PORT_NOP)
+ return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+ /* adjust states for indexing into logical_state_transitions */
+ old -= IB_PORT_DOWN;
+ new -= IB_PORT_DOWN;
+
+ if (old < 0 || new < 0)
+ return HFI_TRANSITION_UNDEFINED;
+ return logical_state_transitions.allowed[old][new];
+}
+
+static int physical_transition_allowed(int old, int new)
+{
+ if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
+ new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
+ pr_warn("invalid physical state(s) (old %d new %d)\n",
+ old, new);
+ return HFI_TRANSITION_UNDEFINED;
+ }
+
+ if (new == IB_PORTPHYSSTATE_NOP)
+ return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+ /* adjust states for indexing into physical_state_transitions */
+ old -= IB_PORTPHYSSTATE_POLLING;
+ new -= IB_PORTPHYSSTATE_POLLING;
+
+ if (old < 0 || new < 0)
+ return HFI_TRANSITION_UNDEFINED;
+ return physical_state_transitions.allowed[old][new];
+}
+
+static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
+ u32 logical_new, u32 physical_new)
+{
+ u32 physical_old = driver_physical_state(ppd);
+ u32 logical_old = driver_logical_state(ppd);
+ int ret, logical_allowed, physical_allowed;
+
+ ret = logical_transition_allowed(logical_old, logical_new);
+ logical_allowed = ret;
+
+ if (ret == HFI_TRANSITION_DISALLOWED ||
+ ret == HFI_TRANSITION_UNDEFINED) {
+ pr_warn("invalid logical state transition %s -> %s\n",
+ opa_lstate_name(logical_old),
+ opa_lstate_name(logical_new));
+ return ret;
+ }
+
+ ret = physical_transition_allowed(physical_old, physical_new);
+ physical_allowed = ret;
+
+ if (ret == HFI_TRANSITION_DISALLOWED ||
+ ret == HFI_TRANSITION_UNDEFINED) {
+ pr_warn("invalid physical state transition %s -> %s\n",
+ opa_pstate_name(physical_old),
+ opa_pstate_name(physical_new));
+ return ret;
+ }
+
+ if (logical_allowed == HFI_TRANSITION_IGNORED &&
+ physical_allowed == HFI_TRANSITION_IGNORED)
+ return HFI_TRANSITION_IGNORED;
+
+ /*
+ * A change request of Physical Port State from
+ * 'Offline' to 'Polling' should be ignored.
+ */
+ if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) &&
+ (physical_new == IB_PORTPHYSSTATE_POLLING))
+ return HFI_TRANSITION_IGNORED;
+
+ /*
+ * Either physical_allowed or logical_allowed is
+ * HFI_TRANSITION_ALLOWED.
+ */
+ return HFI_TRANSITION_ALLOWED;
+}
+
+static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
+ u32 logical_state, u32 phys_state,
+ int suppress_idle_sma)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 link_state;
+ int ret;
+
+ ret = port_states_transition_allowed(ppd, logical_state, phys_state);
+ if (ret == HFI_TRANSITION_DISALLOWED ||
+ ret == HFI_TRANSITION_UNDEFINED) {
+ /* error message emitted above */
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return 0;
+ }
+
+ if (ret == HFI_TRANSITION_IGNORED)
+ return 0;
+
+ if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
+ !(logical_state == IB_PORT_DOWN ||
+ logical_state == IB_PORT_NOP)){
+ pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
+ logical_state, phys_state);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ /*
+ * Logical state changes are summarized in OPAv1g1 spec.,
+ * Table 9-12; physical state changes are summarized in
+ * OPAv1g1 spec., Table 6.4.
+ */
+ switch (logical_state) {
+ case IB_PORT_NOP:
+ if (phys_state == IB_PORTPHYSSTATE_NOP)
+ break;
+ /* FALLTHROUGH */
+ case IB_PORT_DOWN:
+ if (phys_state == IB_PORTPHYSSTATE_NOP) {
+ link_state = HLS_DN_DOWNDEF;
+ } else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+ link_state = HLS_DN_POLL;
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE,
+ 0, OPA_LINKDOWN_REASON_FM_BOUNCE);
+ } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) {
+ link_state = HLS_DN_DISABLE;
+ } else {
+ pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
+ phys_state);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+
+ if ((link_state == HLS_DN_POLL ||
+ link_state == HLS_DN_DOWNDEF)) {
+ /*
+ * Going to poll. No matter what the current state,
+ * always move offline first, then tune and start the
+ * link. This correctly handles a FM link bounce and
+ * a link enable. Going offline is a no-op if already
+ * offline.
+ */
+ set_link_state(ppd, HLS_DN_OFFLINE);
+ tune_serdes(ppd);
+ start_link(ppd);
+ } else {
+ set_link_state(ppd, link_state);
+ }
+ if (link_state == HLS_DN_DISABLE &&
+ (ppd->offline_disabled_reason >
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) ||
+ ppd->offline_disabled_reason ==
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)))
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED);
+ /*
+ * Don't send a reply if the response would be sent
+ * through the disabled port.
+ */
+ if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ break;
+ case IB_PORT_ARMED:
+ ret = set_link_state(ppd, HLS_UP_ARMED);
+ if ((ret == 0) && (suppress_idle_sma == 0))
+ send_idle_sma(dd, SMA_IDLE_ARM);
+ break;
+ case IB_PORT_ACTIVE:
+ if (ppd->neighbor_normal) {
+ ret = set_link_state(ppd, HLS_UP_ACTIVE);
+ if (ret == 0)
+ send_idle_sma(dd, SMA_IDLE_ACTIVE);
+ } else {
+ pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ break;
+ default:
+ pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
+ logical_state);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ return 0;
+}
+
+/**
+ * subn_set_opa_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ */
+static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_port_info *pi = (struct opa_port_info *)data;
+ struct ib_event event;
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ u8 clientrereg;
+ unsigned long flags;
+ u32 smlid, opa_lid; /* tmp vars to hold LID values */
+ u16 lid;
+ u8 ls_old, ls_new, ps_new;
+ u8 vls;
+ u8 msl;
+ u8 crc_enabled;
+ u16 lse, lwe, mtu;
+ u32 num_ports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ int ret, i, invalid = 0, call_set_mtu = 0;
+ int call_link_downgrade_policy = 0;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ opa_lid = be32_to_cpu(pi->lid);
+ if (opa_lid & 0xFFFF0000) {
+ pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ goto get_only;
+ }
+
+ lid = (u16)(opa_lid & 0x0000FFFF);
+
+ smlid = be32_to_cpu(pi->sm_lid);
+ if (smlid & 0xFFFF0000) {
+ pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ goto get_only;
+ }
+ smlid &= 0x0000FFFF;
+
+ clientrereg = (pi->clientrereg_subnettimeout &
+ OPA_PI_MASK_CLIENT_REREGISTER);
+
+ dd = dd_from_ibdev(ibdev);
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ ibp = &ppd->ibport_data;
+ event.device = ibdev;
+ event.element.port_num = port;
+
+ ls_old = driver_lstate(ppd);
+
+ ibp->rvp.mkey = pi->mkey;
+ ibp->rvp.gid_prefix = pi->subnet_prefix;
+ ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+
+ /* Must be a valid unicast LID address. */
+ if ((lid == 0 && ls_old > IB_PORT_INIT) ||
+ lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
+ lid);
+ } else if (ppd->lid != lid ||
+ ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
+ if (ppd->lid != lid)
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
+ if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
+ hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
+ hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ msl = pi->smsl & OPA_PI_MASK_SMSL;
+ if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
+ ppd->linkinit_reason =
+ (pi->partenforce_filterraw &
+ OPA_PI_MASK_LINKINIT_REASON);
+ /* enable/disable SW pkey checking as per FM control */
+ if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
+ ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
+ else
+ ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
+
+ if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
+ ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
+ else
+ ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
+
+ /* Must be a valid unicast LID address. */
+ if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
+ smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
+ } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) {
+ pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
+ spin_lock_irqsave(&ibp->rvp.lock, flags);
+ if (ibp->rvp.sm_ah) {
+ if (smlid != ibp->rvp.sm_lid)
+ ibp->rvp.sm_ah->attr.dlid = smlid;
+ if (msl != ibp->rvp.sm_sl)
+ ibp->rvp.sm_ah->attr.sl = msl;
+ }
+ spin_unlock_irqrestore(&ibp->rvp.lock, flags);
+ if (smlid != ibp->rvp.sm_lid)
+ ibp->rvp.sm_lid = smlid;
+ if (msl != ibp->rvp.sm_sl)
+ ibp->rvp.sm_sl = msl;
+ event.event = IB_EVENT_SM_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ if (pi->link_down_reason == 0) {
+ ppd->local_link_down_reason.sma = 0;
+ ppd->local_link_down_reason.latest = 0;
+ }
+
+ if (pi->neigh_link_down_reason == 0) {
+ ppd->neigh_link_down_reason.sma = 0;
+ ppd->neigh_link_down_reason.latest = 0;
+ }
+
+ ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
+ ppd->sa_qp = be32_to_cpu(pi->sa_qp);
+
+ ppd->port_error_action = be32_to_cpu(pi->port_error_action);
+ lwe = be16_to_cpu(pi->link_width.enabled);
+ if (lwe) {
+ if (lwe == OPA_LINK_WIDTH_RESET ||
+ lwe == OPA_LINK_WIDTH_RESET_OLD)
+ set_link_width_enabled(ppd, ppd->link_width_supported);
+ else if ((lwe & ~ppd->link_width_supported) == 0)
+ set_link_width_enabled(ppd, lwe);
+ else
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
+ /* LWD.E is always applied - 0 means "disabled" */
+ if (lwe == OPA_LINK_WIDTH_RESET ||
+ lwe == OPA_LINK_WIDTH_RESET_OLD) {
+ set_link_width_downgrade_enabled(ppd,
+ ppd->
+ link_width_downgrade_supported
+ );
+ } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
+ /* only set and apply if something changed */
+ if (lwe != ppd->link_width_downgrade_enabled) {
+ set_link_width_downgrade_enabled(ppd, lwe);
+ call_link_downgrade_policy = 1;
+ }
+ } else {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ lse = be16_to_cpu(pi->link_speed.enabled);
+ if (lse) {
+ if (lse & be16_to_cpu(pi->link_speed.supported))
+ set_link_speed_enabled(ppd, lse);
+ else
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ ibp->rvp.mkeyprot =
+ (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+ ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
+ ibp->rvp.vl_high_limit);
+
+ if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+ ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ for (i = 0; i < ppd->vls_supported; i++) {
+ if ((i % 2) == 0)
+ mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >>
+ 4) & 0xF);
+ else
+ mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] &
+ 0xF);
+ if (mtu == 0xffff) {
+ pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
+ mtu,
+ (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ mtu = hfi1_max_mtu; /* use a valid MTU */
+ }
+ if (dd->vld[i].mtu != mtu) {
+ dd_dev_info(dd,
+ "MTU change on vl %d from %d to %d\n",
+ i, dd->vld[i].mtu, mtu);
+ dd->vld[i].mtu = mtu;
+ call_set_mtu++;
+ }
+ }
+ /* As per OPAV1 spec: VL15 must support and be configured
+ * for operation with a 2048 or larger MTU.
+ */
+ mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF);
+ if (mtu < 2048 || mtu == 0xffff)
+ mtu = 2048;
+ if (dd->vld[15].mtu != mtu) {
+ dd_dev_info(dd,
+ "MTU change on vl 15 from %d to %d\n",
+ dd->vld[15].mtu, mtu);
+ dd->vld[15].mtu = mtu;
+ call_set_mtu++;
+ }
+ if (call_set_mtu)
+ set_mtu(ppd);
+
+ /* Set operational VLs */
+ vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
+ if (vls) {
+ if (vls > ppd->vls_supported) {
+ pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
+ pi->operational_vls);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ } else {
+ if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
+ vls) == -EINVAL)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+ }
+
+ if (pi->mkey_violations == 0)
+ ibp->rvp.mkey_violations = 0;
+
+ if (pi->pkey_violations == 0)
+ ibp->rvp.pkey_violations = 0;
+
+ if (pi->qkey_violations == 0)
+ ibp->rvp.qkey_violations = 0;
+
+ ibp->rvp.subnet_timeout =
+ pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
+
+ crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
+ crc_enabled >>= 4;
+ crc_enabled &= 0xf;
+
+ if (crc_enabled != 0)
+ ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
+
+ ppd->is_active_optimize_enabled =
+ !!(be16_to_cpu(pi->port_mode)
+ & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
+
+ ls_new = pi->port_states.portphysstate_portstate &
+ OPA_PI_MASK_PORT_STATE;
+ ps_new = (pi->port_states.portphysstate_portstate &
+ OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
+
+ if (ls_old == IB_PORT_INIT) {
+ if (start_of_sm_config) {
+ if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+ ppd->is_sm_config_started = 1;
+ } else if (ls_new == IB_PORT_ARMED) {
+ if (ppd->is_sm_config_started == 0)
+ invalid = 1;
+ }
+ }
+
+ /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
+ if (clientrereg) {
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ /*
+ * Do the port state change now that the other link parameters
+ * have been set.
+ * Changing the port physical state only makes sense if the link
+ * is down or is being set to down.
+ */
+
+ ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+ if (ret)
+ return ret;
+
+ ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+
+ /* restore re-reg bit per o14-12.2.1 */
+ pi->clientrereg_subnettimeout |= clientrereg;
+
+ /*
+ * Apply the new link downgrade policy. This may result in a link
+ * bounce. Do this after everything else so things are settled.
+ * Possible problem: if setting the port state above fails, then
+ * the policy change is not applied.
+ */
+ if (call_link_downgrade_policy)
+ apply_link_downgrade_policy(ppd, 0);
+
+ return ret;
+
+get_only:
+ return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+ int changed = 0;
+ int update_includes_mgmt_partition = 0;
+
+ /*
+ * IB port one/two always maps to context zero/one,
+ * always a kernel context, no locking needed
+ * If we get here with ppd setup, no need to check
+ * that rcd is valid.
+ */
+ ppd = dd->pport + (port - 1);
+ /*
+ * If the update does not include the management pkey, don't do it.
+ */
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (pkeys[i] == LIM_MGMT_P_KEY) {
+ update_includes_mgmt_partition = 1;
+ break;
+ }
+ }
+
+ if (!update_includes_mgmt_partition)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ u16 key = pkeys[i];
+ u16 okey = ppd->pkeys[i];
+
+ if (key == okey)
+ continue;
+ /*
+ * Don't update pkeys[2], if an HFI port without MgmtAllowed
+ * by neighbor is a switch.
+ */
+ if (i == 2 && !ppd->mgmt_allowed && ppd->neighbor_type == 1)
+ continue;
+ /*
+ * The SM gives us the complete PKey table. We have
+ * to ensure that we put the PKeys in the matching
+ * slots.
+ */
+ ppd->pkeys[i] = key;
+ changed = 1;
+ }
+
+ if (changed) {
+ (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+ hfi1_event_pkey_change(dd, port);
+ }
+
+ return 0;
+}
+
+static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 n_blocks_sent = OPA_AM_NBLK(am);
+ u32 start_block = am & 0x7ff;
+ u16 *p = (u16 *)data;
+ __be16 *q = (__be16 *)data;
+ int i;
+ u16 n_blocks_avail;
+ unsigned npkeys = hfi1_get_npkeys(dd);
+
+ if (n_blocks_sent == 0) {
+ pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+ port, start_block, n_blocks_sent);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+ if (start_block + n_blocks_sent > n_blocks_avail ||
+ n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+ pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+ start_block, n_blocks_sent, n_blocks_avail,
+ OPA_NUM_PKEY_BLOCKS_PER_SMP);
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
+ p[i] = be16_to_cpu(q[i]);
+
+ if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+}
+
+static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+ u64 *val = data;
+
+ *val++ = read_csr(dd, SEND_SC2VLT0);
+ *val++ = read_csr(dd, SEND_SC2VLT1);
+ *val++ = read_csr(dd, SEND_SC2VLT2);
+ *val++ = read_csr(dd, SEND_SC2VLT3);
+ return 0;
+}
+
+#define ILLEGAL_VL 12
+/*
+ * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
+ * for SC15, which must map to VL15). If we don't remap things this
+ * way it is possible for VL15 counters to increment when we try to
+ * send on a SC which is mapped to an invalid VL.
+ */
+static void filter_sc2vlt(void *data)
+{
+ int i;
+ u8 *pd = data;
+
+ for (i = 0; i < OPA_MAX_SCS; i++) {
+ if (i == 15)
+ continue;
+ if ((pd[i] & 0x1f) == 0xf)
+ pd[i] = ILLEGAL_VL;
+ }
+}
+
+static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+ u64 *val = data;
+
+ filter_sc2vlt(data);
+
+ write_csr(dd, SEND_SC2VLT0, *val++);
+ write_csr(dd, SEND_SC2VLT1, *val++);
+ write_csr(dd, SEND_SC2VLT2, *val++);
+ write_csr(dd, SEND_SC2VLT3, *val++);
+ write_seqlock_irq(&dd->sc2vl_lock);
+ memcpy(dd->sc2vl, data, sizeof(dd->sc2vl));
+ write_sequnlock_irq(&dd->sc2vl_lock);
+ return 0;
+}
+
+static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = data;
+ size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
+ unsigned i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
+ *p++ = ibp->sl_to_sc[i];
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = data;
+ int i;
+ u8 sc;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) {
+ sc = *p++;
+ if (ibp->sl_to_sc[i] != sc) {
+ ibp->sl_to_sc[i] = sc;
+
+ /* Put all stale qps into error state */
+ hfi1_error_port_qps(ibp, i);
+ }
+ }
+
+ return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = data;
+ size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
+ unsigned i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+ *p++ = ibp->sc_to_sl[i];
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = data;
+ int i;
+
+ if (am) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+ ibp->sc_to_sl[i] = *p++;
+
+ return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NBLK(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ void *vp = (void *)data;
+ size_t size = 4 * sizeof(u64);
+
+ if (n_blocks != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ get_sc2vlt_tables(dd, vp);
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NBLK(am);
+ int async_update = OPA_AM_ASYNC(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ void *vp = (void *)data;
+ struct hfi1_pportdata *ppd;
+ int lstate;
+
+ if (n_blocks != 1 || async_update) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ lstate = driver_lstate(ppd);
+ /*
+ * it's known that async_update is 0 by this point, but include
+ * the explicit check for clarity
+ */
+ if (!async_update &&
+ (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ set_sc2vlt_tables(dd, vp);
+
+ return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ void *vp = (void *)data;
+ int size;
+
+ if (n_blocks != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ppd = dd->pport + (port - 1);
+
+ size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
+
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 n_blocks = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ void *vp = (void *)data;
+ int lstate;
+
+ if (n_blocks != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* IB numbers ports from 1, hw from 0 */
+ ppd = dd->pport + (port - 1);
+ lstate = driver_lstate(ppd);
+ if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ppd = dd->pport + (port - 1);
+
+ fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
+
+ return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+ resp_len);
+}
+
+static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 nports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ u32 lstate;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
+
+ if (nports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ibp = to_iport(ibdev, port);
+ ppd = ppd_from_ibp(ibp);
+
+ lstate = driver_lstate(ppd);
+
+ if (start_of_sm_config && (lstate == IB_PORT_INIT))
+ ppd->is_sm_config_started = 1;
+
+ psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+ psi->port_states.ledenable_offlinereason |=
+ ppd->is_sm_config_started << 5;
+ psi->port_states.ledenable_offlinereason |=
+ ppd->offline_disabled_reason;
+
+ psi->port_states.portphysstate_portstate =
+ (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+ psi->link_width_downgrade_tx_active =
+ cpu_to_be16(ppd->link_width_downgrade_tx_active);
+ psi->link_width_downgrade_rx_active =
+ cpu_to_be16(ppd->link_width_downgrade_rx_active);
+ if (resp_len)
+ *resp_len += sizeof(struct opa_port_state_info);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ u32 nports = OPA_AM_NPORT(am);
+ u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+ u32 ls_old;
+ u8 ls_new, ps_new;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct opa_port_state_info *psi = (struct opa_port_state_info *)data;
+ int ret, invalid = 0;
+
+ if (nports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ibp = to_iport(ibdev, port);
+ ppd = ppd_from_ibp(ibp);
+
+ ls_old = driver_lstate(ppd);
+
+ ls_new = port_states_to_logical_state(&psi->port_states);
+ ps_new = port_states_to_phys_state(&psi->port_states);
+
+ if (ls_old == IB_PORT_INIT) {
+ if (start_of_sm_config) {
+ if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+ ppd->is_sm_config_started = 1;
+ } else if (ls_new == IB_PORT_ARMED) {
+ if (ppd->is_sm_config_started == 0)
+ invalid = 1;
+ }
+ }
+
+ ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+ if (ret)
+ return ret;
+
+ if (invalid)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 addr = OPA_AM_CI_ADDR(am);
+ u32 len = OPA_AM_CI_LEN(am) + 1;
+ int ret;
+
+ if (dd->pport->port_type != PORT_TYPE_QSFP) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */
+#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
+#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
+
+ /*
+ * check that addr is within spec, and
+ * addr and (addr + len - 1) are on the same "page"
+ */
+ if (addr >= 4096 ||
+ (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ret = get_cable_info(dd, port, addr, len, data);
+
+ if (ret == -ENODEV) {
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* The address range for the CableInfo SMA query is wider than the
+ * memory available on the QSFP cable. We want to return a valid
+ * response, albeit zeroed out, for address ranges beyond available
+ * memory but that are within the CableInfo query spec
+ */
+ if (ret < 0 && ret != -ERANGE) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ if (resp_len)
+ *resp_len += len;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ u32 num_ports = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ struct buffer_control *p = (struct buffer_control *)data;
+ int size;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ ppd = dd->pport + (port - 1);
+ size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
+ trace_bct_get(dd, p);
+ if (resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+ u32 num_ports = OPA_AM_NPORT(am);
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd;
+ struct buffer_control *p = (struct buffer_control *)data;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ ppd = dd->pport + (port - 1);
+ trace_bct_set(dd, p);
+ if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+ u32 num_ports = OPA_AM_NPORT(am);
+ u8 section = (am & 0x00ff0000) >> 16;
+ u8 *p = data;
+ int size = 0;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ switch (section) {
+ case OPA_VLARB_LOW_ELEMENTS:
+ size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
+ break;
+ case OPA_VLARB_HIGH_ELEMENTS:
+ size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+ break;
+ case OPA_VLARB_PREEMPT_ELEMENTS:
+ size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
+ break;
+ case OPA_VLARB_PREEMPT_MATRIX:
+ size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
+ break;
+ default:
+ pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
+ be32_to_cpu(smp->attr_mod));
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+
+ if (size > 0 && resp_len)
+ *resp_len += size;
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+ u32 num_ports = OPA_AM_NPORT(am);
+ u8 section = (am & 0x00ff0000) >> 16;
+ u8 *p = data;
+
+ if (num_ports != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ switch (section) {
+ case OPA_VLARB_LOW_ELEMENTS:
+ (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
+ break;
+ case OPA_VLARB_HIGH_ELEMENTS:
+ (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+ break;
+ /*
+ * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX
+ * can be changed from the default values
+ */
+ case OPA_VLARB_PREEMPT_ELEMENTS:
+ /* FALLTHROUGH */
+ case OPA_VLARB_PREEMPT_MATRIX:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ break;
+ default:
+ pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
+ be32_to_cpu(smp->attr_mod));
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+
+ return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_pma_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[2024];
+} __packed;
+
+struct opa_class_port_info {
+ u8 base_version;
+ u8 class_version;
+ __be16 cap_mask;
+ __be32 cap_mask2_resp_time;
+
+ u8 redirect_gid[16];
+ __be32 redirect_tc_fl;
+ __be32 redirect_lid;
+ __be32 redirect_sl_qp;
+ __be32 redirect_qkey;
+
+ u8 trap_gid[16];
+ __be32 trap_tc_fl;
+ __be32 trap_lid;
+ __be32 trap_hl_qp;
+ __be32 trap_qkey;
+
+ __be16 trap_pkey;
+ __be16 redirect_pkey;
+
+ u8 trap_sl_rsvd;
+ u8 reserved[3];
+} __packed;
+
+struct opa_port_status_req {
+ __u8 port_num;
+ __u8 reserved[3];
+ __be32 vl_select_mask;
+};
+
+#define VL_MASK_ALL 0x000080ff
+
+struct opa_port_status_rsp {
+ __u8 port_num;
+ __u8 reserved[3];
+ __be32 vl_select_mask;
+
+ /* Data counters */
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_pkts;
+ __be64 port_rcv_pkts;
+ __be64 port_multicast_xmit_pkts;
+ __be64 port_multicast_rcv_pkts;
+ __be64 port_xmit_wait;
+ __be64 sw_port_congestion;
+ __be64 port_rcv_fecn;
+ __be64 port_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_xmit_wasted_bw;
+ __be64 port_xmit_wait_data;
+ __be64 port_rcv_bubble;
+ __be64 port_mark_fecn;
+ /* Error counters */
+ __be64 port_rcv_constraint_errors;
+ __be64 port_rcv_switch_relay_errors;
+ __be64 port_xmit_discards;
+ __be64 port_xmit_constraint_errors;
+ __be64 port_rcv_remote_physical_errors;
+ __be64 local_link_integrity_errors;
+ __be64 port_rcv_errors;
+ __be64 excessive_buffer_overruns;
+ __be64 fm_config_errors;
+ __be32 link_error_recovery;
+ __be32 link_downed;
+ u8 uncorrectable_errors;
+
+ u8 link_quality_indicator; /* 5res, 3bit */
+ u8 res2[6];
+ struct _vls_pctrs {
+ /* per-VL Data counters */
+ __be64 port_vl_xmit_data;
+ __be64 port_vl_rcv_data;
+ __be64 port_vl_xmit_pkts;
+ __be64 port_vl_rcv_pkts;
+ __be64 port_vl_xmit_wait;
+ __be64 sw_port_vl_congestion;
+ __be64 port_vl_rcv_fecn;
+ __be64 port_vl_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_vl_xmit_wasted_bw;
+ __be64 port_vl_xmit_wait_data;
+ __be64 port_vl_rcv_bubble;
+ __be64 port_vl_mark_fecn;
+ __be64 port_vl_xmit_discards;
+ } vls[0]; /* real array size defined by # bits set in vl_select_mask */
+};
+
+enum counter_selects {
+ CS_PORT_XMIT_DATA = (1 << 31),
+ CS_PORT_RCV_DATA = (1 << 30),
+ CS_PORT_XMIT_PKTS = (1 << 29),
+ CS_PORT_RCV_PKTS = (1 << 28),
+ CS_PORT_MCAST_XMIT_PKTS = (1 << 27),
+ CS_PORT_MCAST_RCV_PKTS = (1 << 26),
+ CS_PORT_XMIT_WAIT = (1 << 25),
+ CS_SW_PORT_CONGESTION = (1 << 24),
+ CS_PORT_RCV_FECN = (1 << 23),
+ CS_PORT_RCV_BECN = (1 << 22),
+ CS_PORT_XMIT_TIME_CONG = (1 << 21),
+ CS_PORT_XMIT_WASTED_BW = (1 << 20),
+ CS_PORT_XMIT_WAIT_DATA = (1 << 19),
+ CS_PORT_RCV_BUBBLE = (1 << 18),
+ CS_PORT_MARK_FECN = (1 << 17),
+ CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16),
+ CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15),
+ CS_PORT_XMIT_DISCARDS = (1 << 14),
+ CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13),
+ CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12),
+ CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11),
+ CS_PORT_RCV_ERRORS = (1 << 10),
+ CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9),
+ CS_FM_CONFIG_ERRORS = (1 << 8),
+ CS_LINK_ERROR_RECOVERY = (1 << 7),
+ CS_LINK_DOWNED = (1 << 6),
+ CS_UNCORRECTABLE_ERRORS = (1 << 5),
+};
+
+struct opa_clear_port_status {
+ __be64 port_select_mask[4];
+ __be32 counter_select_mask;
+};
+
+struct opa_aggregate {
+ __be16 attr_id;
+ __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */
+ __be32 attr_mod;
+ u8 data[0];
+};
+
+#define MSK_LLI 0x000000f0
+#define MSK_LLI_SFT 4
+#define MSK_LER 0x0000000f
+#define MSK_LER_SFT 0
+#define ADD_LLI 8
+#define ADD_LER 2
+
+/* Request contains first three fields, response contains those plus the rest */
+struct opa_port_data_counters_msg {
+ __be64 port_select_mask[4];
+ __be32 vl_select_mask;
+ __be32 resolution;
+
+ /* Response fields follow */
+ struct _port_dctrs {
+ u8 port_number;
+ u8 reserved2[3];
+ __be32 link_quality_indicator; /* 29res, 3bit */
+
+ /* Data counters */
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_pkts;
+ __be64 port_rcv_pkts;
+ __be64 port_multicast_xmit_pkts;
+ __be64 port_multicast_rcv_pkts;
+ __be64 port_xmit_wait;
+ __be64 sw_port_congestion;
+ __be64 port_rcv_fecn;
+ __be64 port_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_xmit_wasted_bw;
+ __be64 port_xmit_wait_data;
+ __be64 port_rcv_bubble;
+ __be64 port_mark_fecn;
+
+ __be64 port_error_counter_summary;
+ /* Sum of error counts/port */
+
+ struct _vls_dctrs {
+ /* per-VL Data counters */
+ __be64 port_vl_xmit_data;
+ __be64 port_vl_rcv_data;
+ __be64 port_vl_xmit_pkts;
+ __be64 port_vl_rcv_pkts;
+ __be64 port_vl_xmit_wait;
+ __be64 sw_port_vl_congestion;
+ __be64 port_vl_rcv_fecn;
+ __be64 port_vl_rcv_becn;
+ __be64 port_xmit_time_cong;
+ __be64 port_vl_xmit_wasted_bw;
+ __be64 port_vl_xmit_wait_data;
+ __be64 port_vl_rcv_bubble;
+ __be64 port_vl_mark_fecn;
+ } vls[0];
+ /* array size defined by #bits set in vl_select_mask*/
+ } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_counters64_msg {
+ /*
+ * Request contains first two fields, response contains the
+ * whole magilla
+ */
+ __be64 port_select_mask[4];
+ __be32 vl_select_mask;
+
+ /* Response-only fields follow */
+ __be32 reserved1;
+ struct _port_ectrs {
+ u8 port_number;
+ u8 reserved2[7];
+ __be64 port_rcv_constraint_errors;
+ __be64 port_rcv_switch_relay_errors;
+ __be64 port_xmit_discards;
+ __be64 port_xmit_constraint_errors;
+ __be64 port_rcv_remote_physical_errors;
+ __be64 local_link_integrity_errors;
+ __be64 port_rcv_errors;
+ __be64 excessive_buffer_overruns;
+ __be64 fm_config_errors;
+ __be32 link_error_recovery;
+ __be32 link_downed;
+ u8 uncorrectable_errors;
+ u8 reserved3[7];
+ struct _vls_ectrs {
+ __be64 port_vl_xmit_discards;
+ } vls[0];
+ /* array size defined by #bits set in vl_select_mask */
+ } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_info_msg {
+ __be64 port_select_mask[4];
+ __be32 error_info_select_mask;
+ __be32 reserved1;
+ struct _port_ei {
+ u8 port_number;
+ u8 reserved2[7];
+
+ /* PortRcvErrorInfo */
+ struct {
+ u8 status_and_code;
+ union {
+ u8 raw[17];
+ struct {
+ /* EI1to12 format */
+ u8 packet_flit1[8];
+ u8 packet_flit2[8];
+ u8 remaining_flit_bits12;
+ } ei1to12;
+ struct {
+ u8 packet_bytes[8];
+ u8 remaining_flit_bits;
+ } ei13;
+ } ei;
+ u8 reserved3[6];
+ } __packed port_rcv_ei;
+
+ /* ExcessiveBufferOverrunInfo */
+ struct {
+ u8 status_and_sc;
+ u8 reserved4[7];
+ } __packed excessive_buffer_overrun_ei;
+
+ /* PortXmitConstraintErrorInfo */
+ struct {
+ u8 status;
+ u8 reserved5;
+ __be16 pkey;
+ __be32 slid;
+ } __packed port_xmit_constraint_ei;
+
+ /* PortRcvConstraintErrorInfo */
+ struct {
+ u8 status;
+ u8 reserved6;
+ __be16 pkey;
+ __be32 slid;
+ } __packed port_rcv_constraint_ei;
+
+ /* PortRcvSwitchRelayErrorInfo */
+ struct {
+ u8 status_and_code;
+ u8 reserved7[3];
+ __u32 error_info;
+ } __packed port_rcv_switch_relay_ei;
+
+ /* UncorrectableErrorInfo */
+ struct {
+ u8 status_and_code;
+ u8 reserved8;
+ } __packed uncorrectable_ei;
+
+ /* FMConfigErrorInfo */
+ struct {
+ u8 status_and_code;
+ u8 error_info;
+ } __packed fm_config_ei;
+ __u32 reserved9;
+ } port[1]; /* actual array size defined by #ports in attr modifier */
+};
+
+/* opa_port_error_info_msg error_info_select_mask bit definitions */
+enum error_info_selects {
+ ES_PORT_RCV_ERROR_INFO = (1 << 31),
+ ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30),
+ ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29),
+ ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28),
+ ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27),
+ ES_UNCORRECTABLE_ERROR_INFO = (1 << 26),
+ ES_FM_CONFIG_ERROR_INFO = (1 << 25)
+};
+
+static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev, u32 *resp_len)
+{
+ struct opa_class_port_info *p =
+ (struct opa_class_port_info *)pmp->data;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ if (pmp->mad_hdr.attr_mod != 0)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ p->base_version = OPA_MGMT_BASE_VERSION;
+ p->class_version = OPA_SMI_CLASS_VERSION;
+ /*
+ * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+ */
+ p->cap_mask2_resp_time = cpu_to_be32(18);
+
+ if (resp_len)
+ *resp_len += sizeof(*p);
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static void a0_portstatus(struct hfi1_pportdata *ppd,
+ struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+{
+ if (!is_bx(ppd->dd)) {
+ unsigned long vl;
+ u64 sum_vl_xmit_wait = 0;
+ u32 vl_all_mask = VL_MASK_ALL;
+
+ for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+ 8 * sizeof(vl_all_mask)) {
+ u64 tmp = sum_vl_xmit_wait +
+ read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl));
+ if (tmp < sum_vl_xmit_wait) {
+ /* we wrapped */
+ sum_vl_xmit_wait = (u64)~0;
+ break;
+ }
+ sum_vl_xmit_wait = tmp;
+ }
+ if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+ rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+ }
+}
+
+static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct opa_port_status_req *req =
+ (struct opa_port_status_req *)pmp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct opa_port_status_rsp *rsp;
+ u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+ unsigned long vl;
+ size_t response_data_size;
+ u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+ u8 port_num = req->port_num;
+ u8 num_vls = hweight32(vl_select_mask);
+ struct _vls_pctrs *vlinfo;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ int vfi;
+ u64 tmp, tmp2;
+
+ response_data_size = sizeof(struct opa_port_status_rsp) +
+ num_vls * sizeof(struct _vls_pctrs);
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ if (nports != 1 || (port_num && port_num != port) ||
+ num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ rsp = (struct opa_port_status_rsp *)pmp->data;
+ if (port_num)
+ rsp->port_num = port_num;
+ else
+ rsp->port_num = port;
+
+ rsp->port_rcv_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+ CNTR_INVALID_VL));
+
+ hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
+
+ rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+ rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_xmit_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+ rsp->port_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+ rsp->port_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+ rsp->port_xmit_discards =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_remote_physical_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL));
+ rsp->local_link_integrity_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY,
+ CNTR_INVALID_VL));
+ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+ tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL);
+ if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+ /* overflow/wrapped */
+ rsp->link_error_recovery = cpu_to_be32(~0);
+ } else {
+ rsp->link_error_recovery = cpu_to_be32(tmp2);
+ }
+ rsp->port_rcv_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+ rsp->excessive_buffer_overruns =
+ cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+ rsp->fm_config_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+ CNTR_INVALID_VL));
+ rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+ CNTR_INVALID_VL));
+
+ /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
+ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+ rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+ vlinfo = &rsp->vls[0];
+ vfi = 0;
+ /* The vl_select_mask has been checked above, and we know
+ * that it contains only entries which represent valid VLs.
+ * So in the for_each_set_bit() loop below, we don't need
+ * any additional checks for vl.
+ */
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+ memset(vlinfo, 0, sizeof(*vlinfo));
+
+ tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
+ rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
+
+ rsp->vls[vfi].port_vl_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_data =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_pkts =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_discards =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+ idx_from_vl(vl)));
+ vlinfo++;
+ vfi++;
+ }
+
+ a0_portstatus(ppd, rsp, vl_select_mask);
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
+ u8 res_lli, u8 res_ler)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 error_counter_summary = 0, tmp;
+
+ error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+ CNTR_INVALID_VL);
+ /* port_rcv_switch_relay_errors is 0 for HFIs */
+ error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL);
+ /* local link integrity must be right-shifted by the lli resolution */
+ error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY,
+ CNTR_INVALID_VL) >> res_lli);
+ /* link error recovery must b right-shifted by the ler resolution */
+ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+ tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
+ error_counter_summary += (tmp >> res_ler);
+ error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
+ CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+ error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+ CNTR_INVALID_VL);
+ /* ppd->link_downed is a 32-bit value */
+ error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
+ CNTR_INVALID_VL);
+ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+ /* this is an 8-bit quantity */
+ error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+ return error_counter_summary;
+}
+
+static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
+ u32 vl_select_mask)
+{
+ if (!is_bx(ppd->dd)) {
+ unsigned long vl;
+ u64 sum_vl_xmit_wait = 0;
+ u32 vl_all_mask = VL_MASK_ALL;
+
+ for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+ 8 * sizeof(vl_all_mask)) {
+ u64 tmp = sum_vl_xmit_wait +
+ read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl));
+ if (tmp < sum_vl_xmit_wait) {
+ /* we wrapped */
+ sum_vl_xmit_wait = (u64)~0;
+ break;
+ }
+ sum_vl_xmit_wait = tmp;
+ }
+ if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+ rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+ }
+}
+
+static void pma_get_opa_port_dctrs(struct ib_device *ibdev,
+ struct _port_dctrs *rsp)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+
+ rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_xmit_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+ CNTR_INVALID_VL));
+ rsp->port_multicast_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+ CNTR_INVALID_VL));
+}
+
+static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct opa_port_data_counters_msg *req =
+ (struct opa_port_data_counters_msg *)pmp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct _port_dctrs *rsp;
+ struct _vls_dctrs *vlinfo;
+ size_t response_data_size;
+ u32 num_ports;
+ u8 num_pslm;
+ u8 lq, num_vls;
+ u8 res_lli, res_ler;
+ u64 port_mask;
+ u8 port_num;
+ unsigned long vl;
+ u32 vl_select_mask;
+ int vfi;
+
+ num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+ num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+ vl_select_mask = be32_to_cpu(req->vl_select_mask);
+ res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT;
+ res_lli = res_lli ? res_lli + ADD_LLI : 0;
+ res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT;
+ res_ler = res_ler ? res_ler + ADD_LER : 0;
+
+ if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /* Sanity check */
+ response_data_size = sizeof(struct opa_port_data_counters_msg) +
+ num_vls * sizeof(struct _vls_dctrs);
+
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /*
+ * The bit set in the mask needs to be consistent with the
+ * port the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask) * 8);
+
+ if (port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ rsp = &req->port[0];
+ memset(rsp, 0, sizeof(*rsp));
+
+ rsp->port_number = port;
+ /*
+ * Note that link_quality_indicator is a 32 bit quantity in
+ * 'datacounters' queries (as opposed to 'portinfo' queries,
+ * where it's a byte).
+ */
+ hfi1_read_link_quality(dd, &lq);
+ rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+ pma_get_opa_port_dctrs(ibdev, rsp);
+
+ rsp->port_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+ rsp->port_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+ rsp->port_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+ rsp->port_error_counter_summary =
+ cpu_to_be64(get_error_counter_summary(ibdev, port,
+ res_lli, res_ler));
+
+ vlinfo = &rsp->vls[0];
+ vfi = 0;
+ /* The vl_select_mask has been checked above, and we know
+ * that it contains only entries which represent valid VLs.
+ * So in the for_each_set_bit() loop below, we don't need
+ * any additional checks for vl.
+ */
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(req->vl_select_mask)) {
+ memset(vlinfo, 0, sizeof(*vlinfo));
+
+ rsp->vls[vfi].port_vl_xmit_data =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_data =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_pkts =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_pkts =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_xmit_wait =
+ cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+ idx_from_vl(vl)));
+
+ rsp->vls[vfi].port_vl_rcv_fecn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+ idx_from_vl(vl)));
+ rsp->vls[vfi].port_vl_rcv_becn =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+ idx_from_vl(vl)));
+
+ /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
+ /* rsp->port_vl_xmit_wasted_bw ??? */
+ /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
+ * does this differ from rsp->vls[vfi].port_vl_xmit_wait
+ */
+ /*rsp->vls[vfi].port_vl_mark_fecn =
+ * cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+ * + offset));
+ */
+ vlinfo++;
+ vfi++;
+ }
+
+ a0_datacounters(ppd, rsp, vl_select_mask);
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *)
+ pmp->data;
+ struct _port_dctrs rsp;
+
+ if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ goto bail;
+ }
+
+ memset(&rsp, 0, sizeof(rsp));
+ pma_get_opa_port_dctrs(ibdev, &rsp);
+
+ p->port_xmit_data = rsp.port_xmit_data;
+ p->port_rcv_data = rsp.port_rcv_data;
+ p->port_xmit_packets = rsp.port_xmit_pkts;
+ p->port_rcv_packets = rsp.port_rcv_pkts;
+ p->port_unicast_xmit_packets = 0;
+ p->port_unicast_rcv_packets = 0;
+ p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts;
+ p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts;
+
+bail:
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
+ struct _port_ectrs *rsp, u8 port)
+{
+ u64 tmp, tmp2;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+ tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL);
+ if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+ /* overflow/wrapped */
+ rsp->link_error_recovery = cpu_to_be32(~0);
+ } else {
+ rsp->link_error_recovery = cpu_to_be32(tmp2);
+ }
+
+ rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+ rsp->port_rcv_remote_physical_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_switch_relay_errors = 0;
+ rsp->port_xmit_discards =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+ CNTR_INVALID_VL));
+ rsp->port_xmit_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+ CNTR_INVALID_VL));
+ rsp->port_rcv_constraint_errors =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+ CNTR_INVALID_VL));
+ rsp->local_link_integrity_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY,
+ CNTR_INVALID_VL));
+ rsp->excessive_buffer_overruns =
+ cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+}
+
+static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ size_t response_data_size;
+ struct _port_ectrs *rsp;
+ u8 port_num;
+ struct opa_port_error_counters64_msg *req;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u32 num_ports;
+ u8 num_pslm;
+ u8 num_vls;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct _vls_ectrs *vlinfo;
+ unsigned long vl;
+ u64 port_mask, tmp;
+ u32 vl_select_mask;
+ int vfi;
+
+ req = (struct opa_port_error_counters64_msg *)pmp->data;
+
+ num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+ num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+
+ if (num_ports != 1 || num_ports != num_pslm) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ response_data_size = sizeof(struct opa_port_error_counters64_msg) +
+ num_vls * sizeof(struct _vls_ectrs);
+
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+ /*
+ * The bit set in the mask needs to be consistent with the
+ * port the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask) * 8);
+
+ if (port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ rsp = &req->port[0];
+
+ ibp = to_iport(ibdev, port_num);
+ ppd = ppd_from_ibp(ibp);
+
+ memset(rsp, 0, sizeof(*rsp));
+ rsp->port_number = port_num;
+
+ pma_get_opa_port_ectrs(ibdev, rsp, port_num);
+
+ rsp->port_rcv_remote_physical_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+ CNTR_INVALID_VL));
+ rsp->fm_config_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+ CNTR_INVALID_VL));
+ tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+
+ rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+ rsp->port_rcv_errors =
+ cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+ vlinfo = &rsp->vls[0];
+ vfi = 0;
+ vl_select_mask = be32_to_cpu(req->vl_select_mask);
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(req->vl_select_mask)) {
+ memset(vlinfo, 0, sizeof(*vlinfo));
+ rsp->vls[vfi].port_vl_xmit_discards =
+ cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+ idx_from_vl(vl)));
+ vlinfo += 1;
+ vfi++;
+ }
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_ib_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct _port_ectrs rsp;
+ u64 temp_link_overrun_errors;
+ u64 temp_64;
+ u32 temp_32;
+
+ memset(&rsp, 0, sizeof(rsp));
+ pma_get_opa_port_ectrs(ibdev, &rsp, port);
+
+ if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ goto bail;
+ }
+
+ p->symbol_error_counter = 0; /* N/A for OPA */
+
+ temp_32 = be32_to_cpu(rsp.link_error_recovery);
+ if (temp_32 > 0xFFUL)
+ p->link_error_recovery_counter = 0xFF;
+ else
+ p->link_error_recovery_counter = (u8)temp_32;
+
+ temp_32 = be32_to_cpu(rsp.link_downed);
+ if (temp_32 > 0xFFUL)
+ p->link_downed_counter = 0xFF;
+ else
+ p->link_downed_counter = (u8)temp_32;
+
+ temp_64 = be64_to_cpu(rsp.port_rcv_errors);
+ if (temp_64 > 0xFFFFUL)
+ p->port_rcv_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_errors = cpu_to_be16((u16)temp_64);
+
+ temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors);
+ if (temp_64 > 0xFFFFUL)
+ p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64);
+
+ temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors);
+ p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64);
+
+ temp_64 = be64_to_cpu(rsp.port_xmit_discards);
+ if (temp_64 > 0xFFFFUL)
+ p->port_xmit_discards = cpu_to_be16(0xFFFF);
+ else
+ p->port_xmit_discards = cpu_to_be16((u16)temp_64);
+
+ temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors);
+ if (temp_64 > 0xFFUL)
+ p->port_xmit_constraint_errors = 0xFF;
+ else
+ p->port_xmit_constraint_errors = (u8)temp_64;
+
+ temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors);
+ if (temp_64 > 0xFFUL)
+ p->port_rcv_constraint_errors = 0xFFUL;
+ else
+ p->port_rcv_constraint_errors = (u8)temp_64;
+
+ /* LocalLink: 7:4, BufferOverrun: 3:0 */
+ temp_64 = be64_to_cpu(rsp.local_link_integrity_errors);
+ if (temp_64 > 0xFUL)
+ temp_64 = 0xFUL;
+
+ temp_link_overrun_errors = temp_64 << 4;
+
+ temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns);
+ if (temp_64 > 0xFUL)
+ temp_64 = 0xFUL;
+ temp_link_overrun_errors |= temp_64;
+
+ p->link_overrun_errors = (u8)temp_link_overrun_errors;
+
+ p->vl15_dropped = 0; /* N/A for OPA */
+
+bail:
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ size_t response_data_size;
+ struct _port_ei *rsp;
+ struct opa_port_error_info_msg *req;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u64 port_mask;
+ u32 num_ports;
+ u8 port_num;
+ u8 num_pslm;
+ u64 reg;
+
+ req = (struct opa_port_error_info_msg *)pmp->data;
+ rsp = &req->port[0];
+
+ num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+ memset(rsp, 0, sizeof(*rsp));
+
+ if (num_ports != 1 || num_ports != num_pslm) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /* Sanity check */
+ response_data_size = sizeof(struct opa_port_error_info_msg);
+
+ if (response_data_size > sizeof(pmp->data)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /*
+ * The bit set in the mask needs to be consistent with the port
+ * the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask) * 8);
+
+ if (port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /* PortRcvErrorInfo */
+ rsp->port_rcv_ei.status_and_code =
+ dd->err_info_rcvport.status_and_code;
+ memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
+ &dd->err_info_rcvport.packet_flit1, sizeof(u64));
+ memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
+ &dd->err_info_rcvport.packet_flit2, sizeof(u64));
+
+ /* ExcessiverBufferOverrunInfo */
+ reg = read_csr(dd, RCV_ERR_INFO);
+ if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
+ /*
+ * if the RcvExcessBufferOverrun bit is set, save SC of
+ * first pkt that encountered an excess buffer overrun
+ */
+ u8 tmp = (u8)reg;
+
+ tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
+ tmp <<= 2;
+ rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
+ /* set the status bit */
+ rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
+ }
+
+ rsp->port_xmit_constraint_ei.status =
+ dd->err_info_xmit_constraint.status;
+ rsp->port_xmit_constraint_ei.pkey =
+ cpu_to_be16(dd->err_info_xmit_constraint.pkey);
+ rsp->port_xmit_constraint_ei.slid =
+ cpu_to_be32(dd->err_info_xmit_constraint.slid);
+
+ rsp->port_rcv_constraint_ei.status =
+ dd->err_info_rcv_constraint.status;
+ rsp->port_rcv_constraint_ei.pkey =
+ cpu_to_be16(dd->err_info_rcv_constraint.pkey);
+ rsp->port_rcv_constraint_ei.slid =
+ cpu_to_be32(dd->err_info_rcv_constraint.slid);
+
+ /* UncorrectableErrorInfo */
+ rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
+
+ /* FMConfigErrorInfo */
+ rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
+
+ if (resp_len)
+ *resp_len += response_data_size;
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct opa_clear_port_status *req =
+ (struct opa_clear_port_status *)pmp->data;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+ u64 portn = be64_to_cpu(req->port_select_mask[3]);
+ u32 counter_select = be32_to_cpu(req->counter_select_mask);
+ u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+ unsigned long vl;
+
+ if ((nports != 1) || (portn != 1 << port)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+ /*
+ * only counters returned by pma_get_opa_portstatus() are
+ * handled, so when pma_get_opa_portstatus() gets a fix,
+ * the corresponding change should be made here as well.
+ */
+
+ if (counter_select & CS_PORT_XMIT_DATA)
+ write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_DATA)
+ write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_XMIT_PKTS)
+ write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_PKTS)
+ write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
+ write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_MCAST_RCV_PKTS)
+ write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_XMIT_WAIT)
+ write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
+
+ /* ignore cs_sw_portCongestion for HFIs */
+
+ if (counter_select & CS_PORT_RCV_FECN)
+ write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_BECN)
+ write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
+
+ /* ignore cs_port_xmit_time_cong for HFIs */
+ /* ignore cs_port_xmit_wasted_bw for now */
+ /* ignore cs_port_xmit_wait_data for now */
+ if (counter_select & CS_PORT_RCV_BUBBLE)
+ write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
+
+ /* Only applicable for switch */
+ /* if (counter_select & CS_PORT_MARK_FECN)
+ * write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
+ */
+
+ if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
+ write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+ /* ignore cs_port_rcv_switch_relay_errors for HFIs */
+ if (counter_select & CS_PORT_XMIT_DISCARDS)
+ write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
+ write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
+ write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS)
+ write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_LINK_ERROR_RECOVERY) {
+ write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+ write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+ CNTR_INVALID_VL, 0);
+ }
+
+ if (counter_select & CS_PORT_RCV_ERRORS)
+ write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
+ write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+ dd->rcv_ovfl_cnt = 0;
+ }
+
+ if (counter_select & CS_FM_CONFIG_ERRORS)
+ write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_LINK_DOWNED)
+ write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
+
+ if (counter_select & CS_UNCORRECTABLE_ERRORS)
+ write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
+
+ for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+ 8 * sizeof(vl_select_mask)) {
+ if (counter_select & CS_PORT_XMIT_DATA)
+ write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_RCV_DATA)
+ write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_XMIT_PKTS)
+ write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_RCV_PKTS)
+ write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_XMIT_WAIT)
+ write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
+
+ /* sw_port_vl_congestion is 0 for HFIs */
+ if (counter_select & CS_PORT_RCV_FECN)
+ write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
+
+ if (counter_select & CS_PORT_RCV_BECN)
+ write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
+
+ /* port_vl_xmit_time_cong is 0 for HFIs */
+ /* port_vl_xmit_wasted_bw ??? */
+ /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
+ if (counter_select & CS_PORT_RCV_BUBBLE)
+ write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
+
+ /* if (counter_select & CS_PORT_MARK_FECN)
+ * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+ */
+ if (counter_select & C_SW_XMIT_DSCD_VL)
+ write_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+ idx_from_vl(vl), 0);
+ }
+
+ if (resp_len)
+ *resp_len += sizeof(*req);
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
+ struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct _port_ei *rsp;
+ struct opa_port_error_info_msg *req;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ u64 port_mask;
+ u32 num_ports;
+ u8 port_num;
+ u8 num_pslm;
+ u32 error_info_select;
+
+ req = (struct opa_port_error_info_msg *)pmp->data;
+ rsp = &req->port[0];
+
+ num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+ num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+ memset(rsp, 0, sizeof(*rsp));
+
+ if (num_ports != 1 || num_ports != num_pslm) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ /*
+ * The bit set in the mask needs to be consistent with the port
+ * the request came in on.
+ */
+ port_mask = be64_to_cpu(req->port_select_mask[3]);
+ port_num = find_first_bit((unsigned long *)&port_mask,
+ sizeof(port_mask) * 8);
+
+ if (port_num != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ error_info_select = be32_to_cpu(req->error_info_select_mask);
+
+ /* PortRcvErrorInfo */
+ if (error_info_select & ES_PORT_RCV_ERROR_INFO)
+ /* turn off status bit */
+ dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+
+ /* ExcessiverBufferOverrunInfo */
+ if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
+ /*
+ * status bit is essentially kept in the h/w - bit 5 of
+ * RCV_ERR_INFO
+ */
+ write_csr(dd, RCV_ERR_INFO,
+ RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+
+ if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
+ dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+ if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
+ dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+ /* UncorrectableErrorInfo */
+ if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
+ /* turn off status bit */
+ dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
+
+ /* FMConfigErrorInfo */
+ if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
+ /* turn off status bit */
+ dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
+
+ if (resp_len)
+ *resp_len += sizeof(*req);
+
+ return reply((struct ib_mad_hdr *)pmp);
+}
+
+struct opa_congestion_info_attr {
+ __be16 congestion_info;
+ u8 control_table_cap; /* Multiple of 64 entry unit CCTs */
+ u8 congestion_log_length;
+} __packed;
+
+static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_congestion_info_attr *p =
+ (struct opa_congestion_info_attr *)data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ p->congestion_info = 0;
+ p->control_table_cap = ppd->cc_max_table_entries;
+ p->congestion_log_length = OPA_CONG_LOG_ELEMS;
+
+ if (resp_len)
+ *resp_len += sizeof(*p);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ int i;
+ struct opa_congestion_setting_attr *p =
+ (struct opa_congestion_setting_attr *)data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct opa_congestion_setting_entry_shadow *entries;
+ struct cc_state *cc_state;
+
+ rcu_read_lock();
+
+ cc_state = get_cc_state(ppd);
+
+ if (!cc_state) {
+ rcu_read_unlock();
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ entries = cc_state->cong_setting.entries;
+ p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
+ p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ p->entries[i].ccti_increase = entries[i].ccti_increase;
+ p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+ p->entries[i].trigger_threshold =
+ entries[i].trigger_threshold;
+ p->entries[i].ccti_min = entries[i].ccti_min;
+ }
+
+ rcu_read_unlock();
+
+ if (resp_len)
+ *resp_len += sizeof(*p);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * Apply congestion control information stored in the ppd to the
+ * active structure.
+ */
+static void apply_cc_state(struct hfi1_pportdata *ppd)
+{
+ struct cc_state *old_cc_state, *new_cc_state;
+
+ new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+ if (!new_cc_state)
+ return;
+
+ /*
+ * Hold the lock for updating *and* to prevent ppd information
+ * from changing during the update.
+ */
+ spin_lock(&ppd->cc_state_lock);
+
+ old_cc_state = get_cc_state_protected(ppd);
+ if (!old_cc_state) {
+ /* never active, or shutting down */
+ spin_unlock(&ppd->cc_state_lock);
+ kfree(new_cc_state);
+ return;
+ }
+
+ *new_cc_state = *old_cc_state;
+
+ new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1;
+ memcpy(new_cc_state->cct.entries, ppd->ccti_entries,
+ ppd->total_cct_entry * sizeof(struct ib_cc_table_entry));
+
+ new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+ new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+ memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+ OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
+ rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+ spin_unlock(&ppd->cc_state_lock);
+
+ kfree_rcu(old_cc_state, rcu);
+}
+
+static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct opa_congestion_setting_attr *p =
+ (struct opa_congestion_setting_attr *)data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct opa_congestion_setting_entry_shadow *entries;
+ int i;
+
+ /*
+ * Save details from packet into the ppd. Hold the cc_state_lock so
+ * our information is consistent with anyone trying to apply the state.
+ */
+ spin_lock(&ppd->cc_state_lock);
+ ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
+
+ entries = ppd->congestion_entries;
+ for (i = 0; i < OPA_MAX_SLS; i++) {
+ entries[i].ccti_increase = p->entries[i].ccti_increase;
+ entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
+ entries[i].trigger_threshold =
+ p->entries[i].trigger_threshold;
+ entries[i].ccti_min = p->entries[i].ccti_min;
+ }
+ spin_unlock(&ppd->cc_state_lock);
+
+ /* now apply the information */
+ apply_cc_state(ppd);
+
+ return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
+ resp_len);
+}
+
+static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev,
+ u8 port, u32 *resp_len)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
+ s64 ts;
+ int i;
+
+ if (am != 0) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ spin_lock_irq(&ppd->cc_log_lock);
+
+ cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
+ cong_log->congestion_flags = 0;
+ cong_log->threshold_event_counter =
+ cpu_to_be16(ppd->threshold_event_counter);
+ memcpy(cong_log->threshold_cong_event_map,
+ ppd->threshold_cong_event_map,
+ sizeof(cong_log->threshold_cong_event_map));
+ /* keep timestamp in units of 1.024 usec */
+ ts = ktime_to_ns(ktime_get()) / 1024;
+ cong_log->current_time_stamp = cpu_to_be32(ts);
+ for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
+ struct opa_hfi1_cong_log_event_internal *cce =
+ &ppd->cc_events[ppd->cc_mad_idx++];
+ if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
+ ppd->cc_mad_idx = 0;
+ /*
+ * Entries which are older than twice the time
+ * required to wrap the counter are supposed to
+ * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
+ */
+ if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX))
+ continue;
+ memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
+ memcpy(cong_log->events[i].remote_qp_number_cn_entry,
+ &cce->rqpn, 3);
+ cong_log->events[i].sl_svc_type_cn_entry =
+ ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
+ cong_log->events[i].remote_lid_cn_entry =
+ cpu_to_be32(cce->rlid);
+ cong_log->events[i].timestamp_cn_entry =
+ cpu_to_be32(cce->timestamp);
+ }
+
+ /*
+ * Reset threshold_cong_event_map, and threshold_event_counter
+ * to 0 when log is read.
+ */
+ memset(ppd->threshold_cong_event_map, 0x0,
+ sizeof(ppd->threshold_cong_event_map));
+ ppd->threshold_event_counter = 0;
+
+ spin_unlock_irq(&ppd->cc_log_lock);
+
+ if (resp_len)
+ *resp_len += sizeof(struct opa_hfi1_cong_log);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct ib_cc_table_attr *cc_table_attr =
+ (struct ib_cc_table_attr *)data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 start_block = OPA_AM_START_BLK(am);
+ u32 n_blocks = OPA_AM_NBLK(am);
+ struct ib_cc_table_entry_shadow *entries;
+ int i, j;
+ u32 sentry, eentry;
+ struct cc_state *cc_state;
+
+ /* sanity check n_blocks, start_block */
+ if (n_blocks == 0 ||
+ start_block + n_blocks > ppd->cc_max_table_entries) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ rcu_read_lock();
+
+ cc_state = get_cc_state(ppd);
+
+ if (!cc_state) {
+ rcu_read_unlock();
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ sentry = start_block * IB_CCT_ENTRIES;
+ eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
+
+ cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
+
+ entries = cc_state->cct.entries;
+
+ /* return n_blocks, though the last block may not be full */
+ for (j = 0, i = sentry; i < eentry; j++, i++)
+ cc_table_attr->ccti_entries[j].entry =
+ cpu_to_be16(entries[i].entry);
+
+ rcu_read_unlock();
+
+ if (resp_len)
+ *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 start_block = OPA_AM_START_BLK(am);
+ u32 n_blocks = OPA_AM_NBLK(am);
+ struct ib_cc_table_entry_shadow *entries;
+ int i, j;
+ u32 sentry, eentry;
+ u16 ccti_limit;
+
+ /* sanity check n_blocks, start_block */
+ if (n_blocks == 0 ||
+ start_block + n_blocks > ppd->cc_max_table_entries) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ sentry = start_block * IB_CCT_ENTRIES;
+ eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
+ (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
+
+ /* sanity check ccti_limit */
+ ccti_limit = be16_to_cpu(p->ccti_limit);
+ if (ccti_limit + 1 > eentry) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /*
+ * Save details from packet into the ppd. Hold the cc_state_lock so
+ * our information is consistent with anyone trying to apply the state.
+ */
+ spin_lock(&ppd->cc_state_lock);
+ ppd->total_cct_entry = ccti_limit + 1;
+ entries = ppd->ccti_entries;
+ for (j = 0, i = sentry; i < eentry; j++, i++)
+ entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
+ spin_unlock(&ppd->cc_state_lock);
+
+ /* now apply the information */
+ apply_cc_state(ppd);
+
+ return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_led_info {
+ __be32 rsvd_led_mask;
+ __be32 rsvd;
+};
+
+#define OPA_LED_SHIFT 31
+#define OPA_LED_MASK BIT(OPA_LED_SHIFT)
+
+static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct hfi1_pportdata *ppd = dd->pport;
+ struct opa_led_info *p = (struct opa_led_info *)data;
+ u32 nport = OPA_AM_NPORT(am);
+ u32 is_beaconing_active;
+
+ if (nport != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /*
+ * This pairs with the memory barrier in hfi1_start_led_override to
+ * ensure that we read the correct state of LED beaconing represented
+ * by led_override_timer_active
+ */
+ smp_rmb();
+ is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active);
+ p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT);
+
+ if (resp_len)
+ *resp_len += sizeof(struct opa_led_info);
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ struct opa_led_info *p = (struct opa_led_info *)data;
+ u32 nport = OPA_AM_NPORT(am);
+ int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
+
+ if (nport != 1) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ if (on)
+ hfi1_start_led_override(dd->pport, 2000, 1500);
+ else
+ shutdown_led_override(dd->pport);
+
+ return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+}
+
+static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int ret;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+ switch (attr_id) {
+ case IB_SMP_ATTR_NODE_DESC:
+ ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+ ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+ ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+ ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+ ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_PORT_STATE_INFO:
+ ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+ ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_CABLE_INFO:
+ ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_CONGESTION_INFO:
+ ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+ ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
+ port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
+ ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
+ port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+ ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_LED_INFO:
+ ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_SM_INFO:
+ if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+ return IB_MAD_RESULT_SUCCESS;
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)smp);
+ break;
+ }
+ return ret;
+}
+
+static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+ u8 *data, struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int ret;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+ switch (attr_id) {
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+ ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+ ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+ ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+ ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_PORT_STATE_INFO:
+ ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+ ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+ ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
+ port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+ ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_LED_INFO:
+ ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
+ resp_len);
+ break;
+ case IB_SMP_ATTR_SM_INFO:
+ if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+ return IB_MAD_RESULT_SUCCESS;
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)smp);
+ break;
+ }
+ return ret;
+}
+
+static inline void set_aggr_error(struct opa_aggregate *ag)
+{
+ ag->err_reqlength |= cpu_to_be16(0x8000);
+}
+
+static int subn_get_opa_aggregate(struct opa_smp *smp,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int i;
+ u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+ u8 *next_smp = opa_get_smp_data(smp);
+
+ if (num_attr < 1 || num_attr > 117) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < num_attr; i++) {
+ struct opa_aggregate *agg;
+ size_t agg_data_len;
+ size_t agg_size;
+ u32 am;
+
+ agg = (struct opa_aggregate *)next_smp;
+ agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+ agg_size = sizeof(*agg) + agg_data_len;
+ am = be32_to_cpu(agg->attr_mod);
+
+ *resp_len += agg_size;
+
+ if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ /* zero the payload for this segment */
+ memset(next_smp + sizeof(*agg), 0, agg_data_len);
+
+ (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+ ibdev, port, NULL);
+ if (smp->status & ~IB_SMP_DIRECTION) {
+ set_aggr_error(agg);
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ next_smp += agg_size;
+ }
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_set_opa_aggregate(struct opa_smp *smp,
+ struct ib_device *ibdev, u8 port,
+ u32 *resp_len)
+{
+ int i;
+ u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+ u8 *next_smp = opa_get_smp_data(smp);
+
+ if (num_attr < 1 || num_attr > 117) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ for (i = 0; i < num_attr; i++) {
+ struct opa_aggregate *agg;
+ size_t agg_data_len;
+ size_t agg_size;
+ u32 am;
+
+ agg = (struct opa_aggregate *)next_smp;
+ agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+ agg_size = sizeof(*agg) + agg_data_len;
+ am = be32_to_cpu(agg->attr_mod);
+
+ *resp_len += agg_size;
+
+ if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ return reply((struct ib_mad_hdr *)smp);
+ }
+
+ (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+ ibdev, port, NULL);
+ if (smp->status & ~IB_SMP_DIRECTION) {
+ set_aggr_error(agg);
+ return reply((struct ib_mad_hdr *)smp);
+ }
+ next_smp += agg_size;
+ }
+
+ return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * OPAv1 specifies that, on the transition to link up, these counters
+ * are cleared:
+ * PortRcvErrors [*]
+ * LinkErrorRecovery
+ * LocalLinkIntegrityErrors
+ * ExcessiveBufferOverruns [*]
+ *
+ * [*] Error info associated with these counters is retained, but the
+ * error info status is reset to 0.
+ */
+void clear_linkup_counters(struct hfi1_devdata *dd)
+{
+ /* PortRcvErrors */
+ write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+ dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+ /* LinkErrorRecovery */
+ write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+ write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
+ /* LocalLinkIntegrityErrors */
+ write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+ /* ExcessiveBufferOverruns */
+ write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+ dd->rcv_ovfl_cnt = 0;
+ dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+}
+
+/*
+ * is_local_mad() returns 1 if 'mad' is sent from, and destined to the
+ * local node, 0 otherwise.
+ */
+static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
+ const struct ib_wc *in_wc)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ const struct opa_smp *smp = (const struct opa_smp *)mad;
+
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ return (smp->hop_cnt == 0 &&
+ smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
+ smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
+ }
+
+ return (in_wc->slid == ppd->lid);
+}
+
+/*
+ * opa_local_smp_check() should only be called on MADs for which
+ * is_local_mad() returns true. It applies the SMP checks that are
+ * specific to SMPs which are sent from, and destined to this node.
+ * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
+ * otherwise.
+ *
+ * SMPs which arrive from other nodes are instead checked by
+ * opa_smp_check().
+ */
+static int opa_local_smp_check(struct hfi1_ibport *ibp,
+ const struct ib_wc *in_wc)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u16 slid = in_wc->slid;
+ u16 pkey;
+
+ if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
+ return 1;
+
+ pkey = ppd->pkeys[in_wc->pkey_index];
+ /*
+ * We need to do the "node-local" checks specified in OPAv1,
+ * rev 0.90, section 9.10.26, which are:
+ * - pkey is 0x7fff, or 0xffff
+ * - Source QPN == 0 || Destination QPN == 0
+ * - the MAD header's management class is either
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
+ * IB_MGMT_CLASS_SUBN_LID_ROUTED
+ * - SLID != 0
+ *
+ * However, we know (and so don't need to check again) that,
+ * for local SMPs, the MAD stack passes MADs with:
+ * - Source QPN of 0
+ * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
+ * our own port's lid
+ *
+ */
+ if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
+ return 0;
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+}
+
+static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
+ u8 port, const struct opa_mad *in_mad,
+ struct opa_mad *out_mad,
+ u32 *resp_len)
+{
+ struct opa_smp *smp = (struct opa_smp *)out_mad;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ u8 *data;
+ u32 am;
+ __be16 attr_id;
+ int ret;
+
+ *out_mad = *in_mad;
+ data = opa_get_smp_data(smp);
+
+ am = be32_to_cpu(smp->attr_mod);
+ attr_id = smp->attr_id;
+ if (smp->class_version != OPA_SMI_CLASS_VERSION) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_mad_hdr *)smp);
+ return ret;
+ }
+ ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
+ smp->route.dr.dr_slid, smp->route.dr.return_path,
+ smp->hop_cnt);
+ if (ret) {
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ /*
+ * If this is a get/set portinfo, we already check the
+ * M_Key if the MAD is for another port and the M_Key
+ * is OK on the receiving port. This check is needed
+ * to increment the error counters when the M_Key
+ * fails to match on *both* ports.
+ */
+ if (attr_id == IB_SMP_ATTR_PORT_INFO &&
+ (smp->method == IB_MGMT_METHOD_GET ||
+ smp->method == IB_MGMT_METHOD_SET) &&
+ port_num && port_num <= ibdev->phys_port_cnt &&
+ port != port_num)
+ (void)check_mkey(to_iport(ibdev, port_num),
+ (struct ib_mad_hdr *)smp, 0,
+ smp->mkey, smp->route.dr.dr_slid,
+ smp->route.dr.return_path,
+ smp->hop_cnt);
+ ret = IB_MAD_RESULT_FAILURE;
+ return ret;
+ }
+
+ *resp_len = opa_get_smp_header_size(smp);
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (attr_id) {
+ default:
+ clear_opa_smp_data(smp);
+ ret = subn_get_opa_sma(attr_id, smp, am, data,
+ ibdev, port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_AGGREGATE:
+ ret = subn_get_opa_aggregate(smp, ibdev, port,
+ resp_len);
+ break;
+ }
+ break;
+ case IB_MGMT_METHOD_SET:
+ switch (attr_id) {
+ default:
+ ret = subn_set_opa_sma(attr_id, smp, am, data,
+ ibdev, port, resp_len);
+ break;
+ case OPA_ATTRIB_ID_AGGREGATE:
+ ret = subn_set_opa_aggregate(smp, ibdev, port,
+ resp_len);
+ break;
+ }
+ break;
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ break;
+ default:
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_mad_hdr *)smp);
+ break;
+ }
+
+ return ret;
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+ u8 port, const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_smp *smp = (struct ib_smp *)out_mad;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+ int ret;
+
+ *out_mad = *in_mad;
+ if (smp->class_version != 1) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_mad_hdr *)smp);
+ return ret;
+ }
+
+ ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
+ smp->mkey, (__force __be32)smp->dr_slid,
+ smp->return_path, smp->hop_cnt);
+ if (ret) {
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ /*
+ * If this is a get/set portinfo, we already check the
+ * M_Key if the MAD is for another port and the M_Key
+ * is OK on the receiving port. This check is needed
+ * to increment the error counters when the M_Key
+ * fails to match on *both* ports.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ (smp->method == IB_MGMT_METHOD_GET ||
+ smp->method == IB_MGMT_METHOD_SET) &&
+ port_num && port_num <= ibdev->phys_port_cnt &&
+ port != port_num)
+ (void)check_mkey(to_iport(ibdev, port_num),
+ (struct ib_mad_hdr *)smp, 0,
+ smp->mkey,
+ (__force __be32)smp->dr_slid,
+ smp->return_path, smp->hop_cnt);
+ ret = IB_MAD_RESULT_FAILURE;
+ return ret;
+ }
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = subn_get_nodeinfo(smp, ibdev, port);
+ break;
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)smp);
+ break;
+ }
+ break;
+ }
+
+ return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port,
+ const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+ struct ib_class_port_info *cpi = (struct ib_class_port_info *)
+ &pmp->data;
+ int ret = IB_MAD_RESULT_FAILURE;
+
+ *out_mad = *in_mad;
+ if (pmp->mad_hdr.class_version != 1) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ return ret;
+ }
+
+ switch (pmp->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_PORT_COUNTERS:
+ ret = pma_get_ib_portcounters(pmp, ibdev, port);
+ break;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = pma_get_ib_portcounters_ext(pmp, ibdev, port);
+ break;
+ case IB_PMA_CLASS_PORT_INFO:
+ cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ break;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ break;
+ }
+ break;
+
+ case IB_MGMT_METHOD_SET:
+ if (pmp->mad_hdr.attr_id) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ }
+ break;
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ break;
+
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ break;
+ }
+
+ return ret;
+}
+
+static int process_perf_opa(struct ib_device *ibdev, u8 port,
+ const struct opa_mad *in_mad,
+ struct opa_mad *out_mad, u32 *resp_len)
+{
+ struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
+ int ret;
+
+ *out_mad = *in_mad;
+
+ if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+ return reply((struct ib_mad_hdr *)pmp);
+ }
+
+ *resp_len = sizeof(pmp->mad_hdr);
+
+ switch (pmp->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_CLASS_PORT_INFO:
+ ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
+ break;
+ case OPA_PM_ATTRIB_ID_PORT_STATUS:
+ ret = pma_get_opa_portstatus(pmp, ibdev, port,
+ resp_len);
+ break;
+ case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
+ ret = pma_get_opa_datacounters(pmp, ibdev, port,
+ resp_len);
+ break;
+ case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
+ ret = pma_get_opa_porterrors(pmp, ibdev, port,
+ resp_len);
+ break;
+ case OPA_PM_ATTRIB_ID_ERROR_INFO:
+ ret = pma_get_opa_errorinfo(pmp, ibdev, port,
+ resp_len);
+ break;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ break;
+ }
+ break;
+
+ case IB_MGMT_METHOD_SET:
+ switch (pmp->mad_hdr.attr_id) {
+ case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
+ ret = pma_set_opa_portstatus(pmp, ibdev, port,
+ resp_len);
+ break;
+ case OPA_PM_ATTRIB_ID_ERROR_INFO:
+ ret = pma_set_opa_errorinfo(pmp, ibdev, port,
+ resp_len);
+ break;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ break;
+ }
+ break;
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ break;
+
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_mad_hdr *)pmp);
+ break;
+ }
+
+ return ret;
+}
+
+static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
+ u8 port, const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct opa_mad *in_mad,
+ struct opa_mad *out_mad, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ int ret;
+ int pkey_idx;
+ u32 resp_len = 0;
+ struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+ pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+ if (pkey_idx < 0) {
+ pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
+ hfi1_get_pkey(ibp, 1));
+ pkey_idx = 1;
+ }
+ *out_mad_pkey_index = (u16)pkey_idx;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ if (is_local_mad(ibp, in_mad, in_wc)) {
+ ret = opa_local_smp_check(ibp, in_wc);
+ if (ret)
+ return IB_MAD_RESULT_FAILURE;
+ }
+ ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
+ out_mad, &resp_len);
+ goto bail;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ ret = process_perf_opa(ibdev, port, in_mad, out_mad,
+ &resp_len);
+ goto bail;
+
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ }
+
+bail:
+ if (ret & IB_MAD_RESULT_REPLY)
+ *out_mad_size = round_up(resp_len, 8);
+ else if (ret & IB_MAD_RESULT_SUCCESS)
+ *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
+
+ return ret;
+}
+
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ int ret;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+ break;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ ret = process_perf(ibdev, port, in_mad, out_mad);
+ break;
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ break;
+ }
+
+ return ret;
+}
+
+/**
+ * hfi1_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+ struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ switch (in_mad->base_version) {
+ case OPA_MGMT_BASE_VERSION:
+ if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
+ dev_err(ibdev->dma_device, "invalid in_mad_size\n");
+ return IB_MAD_RESULT_FAILURE;
+ }
+ return hfi1_process_opa_mad(ibdev, mad_flags, port,
+ in_wc, in_grh,
+ (struct opa_mad *)in_mad,
+ (struct opa_mad *)out_mad,
+ out_mad_size,
+ out_mad_pkey_index);
+ case IB_MGMT_BASE_VERSION:
+ return hfi1_process_ib_mad(ibdev, mad_flags, port,
+ in_wc, in_grh,
+ (const struct ib_mad *)in_mad,
+ (struct ib_mad *)out_mad);
+ default:
+ break;
+ }
+
+ return IB_MAD_RESULT_FAILURE;
+}
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
new file mode 100644
index 000000000000..5aa3fd1be653
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/mad.h
@@ -0,0 +1,432 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MAD_H
+#define _HFI1_MAD_H
+
+#include <rdma/ib_pma.h>
+#include <rdma/opa_smi.h>
+#include <rdma/opa_port_info.h>
+#include "opa_compat.h"
+
+/*
+ * OPA Traps
+ */
+#define OPA_TRAP_GID_NOW_IN_SERVICE cpu_to_be16(64)
+#define OPA_TRAP_GID_OUT_OF_SERVICE cpu_to_be16(65)
+#define OPA_TRAP_ADD_MULTICAST_GROUP cpu_to_be16(66)
+#define OPA_TRAL_DEL_MULTICAST_GROUP cpu_to_be16(67)
+#define OPA_TRAP_UNPATH cpu_to_be16(68)
+#define OPA_TRAP_REPATH cpu_to_be16(69)
+#define OPA_TRAP_PORT_CHANGE_STATE cpu_to_be16(128)
+#define OPA_TRAP_LINK_INTEGRITY cpu_to_be16(129)
+#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN cpu_to_be16(130)
+#define OPA_TRAP_FLOW_WATCHDOG cpu_to_be16(131)
+#define OPA_TRAP_CHANGE_CAPABILITY cpu_to_be16(144)
+#define OPA_TRAP_CHANGE_SYSGUID cpu_to_be16(145)
+#define OPA_TRAP_BAD_M_KEY cpu_to_be16(256)
+#define OPA_TRAP_BAD_P_KEY cpu_to_be16(257)
+#define OPA_TRAP_BAD_Q_KEY cpu_to_be16(258)
+#define OPA_TRAP_SWITCH_BAD_PKEY cpu_to_be16(259)
+#define OPA_SMA_TRAP_DATA_LINK_WIDTH cpu_to_be16(2048)
+
+/*
+ * Generic trap/notice other local changes flags (trap 144).
+ */
+#define OPA_NOTICE_TRAP_LWDE_CHG 0x08 /* Link Width Downgrade Enable
+ * changed
+ */
+#define OPA_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */
+#define OPA_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */
+#define OPA_NOTICE_TRAP_NODE_DESC_CHG 0x01
+
+struct opa_mad_notice_attr {
+ u8 generic_type;
+ u8 prod_type_msb;
+ __be16 prod_type_lsb;
+ __be16 trap_num;
+ __be16 toggle_count;
+ __be32 issuer_lid;
+ __be32 reserved1;
+ union ib_gid issuer_gid;
+
+ union {
+ struct {
+ u8 details[64];
+ } raw_data;
+
+ struct {
+ union ib_gid gid;
+ } __packed ntc_64_65_66_67;
+
+ struct {
+ __be32 lid;
+ } __packed ntc_128;
+
+ struct {
+ __be32 lid; /* where violation happened */
+ u8 port_num; /* where violation happened */
+ } __packed ntc_129_130_131;
+
+ struct {
+ __be32 lid; /* LID where change occurred */
+ __be32 new_cap_mask; /* new capability mask */
+ __be16 reserved2;
+ __be16 cap_mask;
+ __be16 change_flags; /* low 4 bits only */
+ } __packed ntc_144;
+
+ struct {
+ __be64 new_sys_guid;
+ __be32 lid; /* lid where sys guid changed */
+ } __packed ntc_145;
+
+ struct {
+ __be32 lid;
+ __be32 dr_slid;
+ u8 method;
+ u8 dr_trunc_hop;
+ __be16 attr_id;
+ __be32 attr_mod;
+ __be64 mkey;
+ u8 dr_rtn_path[30];
+ } __packed ntc_256;
+
+ struct {
+ __be32 lid1;
+ __be32 lid2;
+ __be32 key;
+ u8 sl; /* SL: high 5 bits */
+ u8 reserved3[3];
+ union ib_gid gid1;
+ union ib_gid gid2;
+ __be32 qp1; /* high 8 bits reserved */
+ __be32 qp2; /* high 8 bits reserved */
+ } __packed ntc_257_258;
+
+ struct {
+ __be16 flags; /* low 8 bits reserved */
+ __be16 pkey;
+ __be32 lid1;
+ __be32 lid2;
+ u8 sl; /* SL: high 5 bits */
+ u8 reserved4[3];
+ union ib_gid gid1;
+ union ib_gid gid2;
+ __be32 qp1; /* high 8 bits reserved */
+ __be32 qp2; /* high 8 bits reserved */
+ } __packed ntc_259;
+
+ struct {
+ __be32 lid;
+ } __packed ntc_2048;
+
+ };
+ u8 class_data[0];
+};
+
+#define IB_VLARB_LOWPRI_0_31 1
+#define IB_VLARB_LOWPRI_32_63 2
+#define IB_VLARB_HIGHPRI_0_31 3
+#define IB_VLARB_HIGHPRI_32_63 4
+
+#define OPA_MAX_PREEMPT_CAP 32
+#define OPA_VLARB_LOW_ELEMENTS 0
+#define OPA_VLARB_HIGH_ELEMENTS 1
+#define OPA_VLARB_PREEMPT_ELEMENTS 2
+#define OPA_VLARB_PREEMPT_MATRIX 3
+
+#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+ u8 reserved;
+ u8 reserved1;
+ __be16 port_check_rate;
+ __be16 symbol_error_counter;
+ u8 link_error_recovery_counter;
+ u8 link_downed_counter;
+ __be16 port_rcv_errors;
+ __be16 port_rcv_remphys_errors;
+ __be16 port_rcv_switch_relay_errors;
+ __be16 port_xmit_discards;
+ u8 port_xmit_constraint_errors;
+ u8 port_rcv_constraint_errors;
+ u8 reserved2;
+ u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+ __be16 reserved3;
+ __be16 vl15_dropped;
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_packets;
+ __be64 port_rcv_packets;
+ __be64 port_xmit_wait;
+ __be64 port_adr_events;
+} __packed;
+
+#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
+
+#define OPA_MAX_PREEMPT_CAP 32
+#define OPA_VLARB_LOW_ELEMENTS 0
+#define OPA_VLARB_HIGH_ELEMENTS 1
+#define OPA_VLARB_PREEMPT_ELEMENTS 2
+#define OPA_VLARB_PREEMPT_MATRIX 3
+
+#define HFI1_XMIT_RATE_UNSUPPORTED 0x0
+#define HFI1_XMIT_RATE_PICO 0x7
+/* number of 4nsec cycles equaling 2secs */
+#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC
+
+#define IB_CC_SVCTYPE_RC 0x0
+#define IB_CC_SVCTYPE_UC 0x1
+#define IB_CC_SVCTYPE_RD 0x2
+#define IB_CC_SVCTYPE_UD 0x3
+
+/*
+ * There should be an equivalent IB #define for the following, but
+ * I cannot find it.
+ */
+#define OPA_CC_LOG_TYPE_HFI 2
+
+struct opa_hfi1_cong_log_event_internal {
+ u32 lqpn;
+ u32 rqpn;
+ u8 sl;
+ u8 svc_type;
+ u32 rlid;
+ s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
+};
+
+struct opa_hfi1_cong_log_event {
+ u8 local_qp_cn_entry[3];
+ u8 remote_qp_number_cn_entry[3];
+ u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
+ u8 reserved;
+ __be32 remote_lid_cn_entry;
+ __be32 timestamp_cn_entry;
+} __packed;
+
+#define OPA_CONG_LOG_ELEMS 96
+
+struct opa_hfi1_cong_log {
+ u8 log_type;
+ u8 congestion_flags;
+ __be16 threshold_event_counter;
+ __be32 current_time_stamp;
+ u8 threshold_cong_event_map[OPA_MAX_SLS / 8];
+ struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
+} __packed;
+
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct opa_congestion_setting_entry {
+ u8 ccti_increase;
+ u8 reserved;
+ __be16 ccti_timer;
+ u8 trigger_threshold;
+ u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_entry_shadow {
+ u8 ccti_increase;
+ u8 reserved;
+ u16 ccti_timer;
+ u8 trigger_threshold;
+ u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_attr {
+ __be32 control_map;
+ __be16 port_control;
+ struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
+} __packed;
+
+struct opa_congestion_setting_attr_shadow {
+ u32 control_map;
+ u16 port_control;
+ struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+ __be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+ u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+ __be16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+ u16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+ (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+ u16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * struct cc_state combines the (active) per-port congestion control
+ * table, and the (active) per-SL congestion settings. cc_state data
+ * may need to be read in code paths that we want to be fast, so it
+ * is an RCU protected structure.
+ */
+struct cc_state {
+ struct rcu_head rcu;
+ struct cc_table_shadow cct;
+ struct opa_congestion_setting_attr_shadow cong_setting;
+};
+
+/*
+ * OPA BufferControl MAD
+ */
+
+/* attribute modifier macros */
+#define OPA_AM_NPORT_SHIFT 24
+#define OPA_AM_NPORT_MASK 0xff
+#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
+#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \
+ OPA_AM_NPORT_MASK)
+
+#define OPA_AM_NBLK_SHIFT 24
+#define OPA_AM_NBLK_MASK 0xff
+#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
+#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \
+ OPA_AM_NBLK_MASK)
+
+#define OPA_AM_START_BLK_SHIFT 0
+#define OPA_AM_START_BLK_MASK 0xff
+#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
+ OPA_AM_START_BLK_SHIFT)
+#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \
+ OPA_AM_START_BLK_MASK)
+
+#define OPA_AM_PORTNUM_SHIFT 0
+#define OPA_AM_PORTNUM_MASK 0xff
+#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
+#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \
+ OPA_AM_PORTNUM_MASK)
+
+#define OPA_AM_ASYNC_SHIFT 12
+#define OPA_AM_ASYNC_MASK 0x1
+#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
+#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \
+ OPA_AM_ASYNC_MASK)
+
+#define OPA_AM_START_SM_CFG_SHIFT 9
+#define OPA_AM_START_SM_CFG_MASK 0x1
+#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \
+ OPA_AM_START_SM_CFG_SHIFT)
+#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
+ & OPA_AM_START_SM_CFG_MASK)
+
+#define OPA_AM_CI_ADDR_SHIFT 19
+#define OPA_AM_CI_ADDR_MASK 0xfff
+#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT)
+#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
+ OPA_AM_CI_ADDR_MASK)
+
+#define OPA_AM_CI_LEN_SHIFT 13
+#define OPA_AM_CI_LEN_MASK 0x3f
+#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT)
+#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \
+ OPA_AM_CI_LEN_MASK)
+
+/* error info macros */
+#define OPA_EI_STATUS_SMASK 0x80
+#define OPA_EI_CODE_SMASK 0x0f
+
+struct vl_limit {
+ __be16 dedicated;
+ __be16 shared;
+};
+
+struct buffer_control {
+ __be16 reserved;
+ __be16 overall_shared_limit;
+ struct vl_limit vl[OPA_MAX_VLS];
+};
+
+struct sc2vlnt {
+ u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
+};
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 \
+ cpu_to_be32(COUNTER_MASK(1, 0) | \
+ COUNTER_MASK(1, 1) | \
+ COUNTER_MASK(1, 2) | \
+ COUNTER_MASK(1, 3) | \
+ COUNTER_MASK(1, 4))
+
+void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port);
+
+#endif /* _HFI1_MAD_H */
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
new file mode 100644
index 000000000000..7ad30898fc19
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/mmu_notifier.h>
+#include <linux/interval_tree_generic.h>
+
+#include "mmu_rb.h"
+#include "trace.h"
+
+struct mmu_rb_handler {
+ struct mmu_notifier mn;
+ struct rb_root root;
+ void *ops_arg;
+ spinlock_t lock; /* protect the RB tree */
+ struct mmu_rb_ops *ops;
+ struct mm_struct *mm;
+ struct list_head lru_list;
+ struct work_struct del_work;
+ struct list_head del_list;
+ struct workqueue_struct *wq;
+};
+
+static unsigned long mmu_node_start(struct mmu_rb_node *);
+static unsigned long mmu_node_last(struct mmu_rb_node *);
+static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
+ unsigned long);
+static inline void mmu_notifier_range_start(struct mmu_notifier *,
+ struct mm_struct *,
+ unsigned long, unsigned long);
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
+ struct mm_struct *,
+ unsigned long, unsigned long);
+static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
+ unsigned long, unsigned long);
+static void do_remove(struct mmu_rb_handler *handler,
+ struct list_head *del_list);
+static void handle_remove(struct work_struct *work);
+
+static struct mmu_notifier_ops mn_opts = {
+ .invalidate_page = mmu_notifier_page,
+ .invalidate_range_start = mmu_notifier_range_start,
+};
+
+INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
+ mmu_node_start, mmu_node_last, static, __mmu_int_rb);
+
+static unsigned long mmu_node_start(struct mmu_rb_node *node)
+{
+ return node->addr & PAGE_MASK;
+}
+
+static unsigned long mmu_node_last(struct mmu_rb_node *node)
+{
+ return PAGE_ALIGN(node->addr + node->len) - 1;
+}
+
+int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+ struct mmu_rb_ops *ops,
+ struct workqueue_struct *wq,
+ struct mmu_rb_handler **handler)
+{
+ struct mmu_rb_handler *handlr;
+ int ret;
+
+ handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
+ if (!handlr)
+ return -ENOMEM;
+
+ handlr->root = RB_ROOT;
+ handlr->ops = ops;
+ handlr->ops_arg = ops_arg;
+ INIT_HLIST_NODE(&handlr->mn.hlist);
+ spin_lock_init(&handlr->lock);
+ handlr->mn.ops = &mn_opts;
+ handlr->mm = mm;
+ INIT_WORK(&handlr->del_work, handle_remove);
+ INIT_LIST_HEAD(&handlr->del_list);
+ INIT_LIST_HEAD(&handlr->lru_list);
+ handlr->wq = wq;
+
+ ret = mmu_notifier_register(&handlr->mn, handlr->mm);
+ if (ret) {
+ kfree(handlr);
+ return ret;
+ }
+
+ *handler = handlr;
+ return 0;
+}
+
+void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
+{
+ struct mmu_rb_node *rbnode;
+ struct rb_node *node;
+ unsigned long flags;
+ struct list_head del_list;
+
+ /* Unregister first so we don't get any more notifications. */
+ mmu_notifier_unregister(&handler->mn, handler->mm);
+
+ /*
+ * Make sure the wq delete handler is finished running. It will not
+ * be triggered once the mmu notifiers are unregistered above.
+ */
+ flush_work(&handler->del_work);
+
+ INIT_LIST_HEAD(&del_list);
+
+ spin_lock_irqsave(&handler->lock, flags);
+ while ((node = rb_first(&handler->root))) {
+ rbnode = rb_entry(node, struct mmu_rb_node, node);
+ rb_erase(node, &handler->root);
+ /* move from LRU list to delete list */
+ list_move(&rbnode->list, &del_list);
+ }
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ do_remove(handler, &del_list);
+
+ kfree(handler);
+}
+
+int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
+ struct mmu_rb_node *mnode)
+{
+ struct mmu_rb_node *node;
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&handler->lock, flags);
+ hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr,
+ mnode->len);
+ node = __mmu_rb_search(handler, mnode->addr, mnode->len);
+ if (node) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ __mmu_int_rb_insert(mnode, &handler->root);
+ list_add(&mnode->list, &handler->lru_list);
+
+ ret = handler->ops->insert(handler->ops_arg, mnode);
+ if (ret) {
+ __mmu_int_rb_remove(mnode, &handler->root);
+ list_del(&mnode->list); /* remove from LRU list */
+ }
+unlock:
+ spin_unlock_irqrestore(&handler->lock, flags);
+ return ret;
+}
+
+/* Caller must hold handler lock */
+static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
+ unsigned long addr,
+ unsigned long len)
+{
+ struct mmu_rb_node *node = NULL;
+
+ hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len);
+ if (!handler->ops->filter) {
+ node = __mmu_int_rb_iter_first(&handler->root, addr,
+ (addr + len) - 1);
+ } else {
+ for (node = __mmu_int_rb_iter_first(&handler->root, addr,
+ (addr + len) - 1);
+ node;
+ node = __mmu_int_rb_iter_next(node, addr,
+ (addr + len) - 1)) {
+ if (handler->ops->filter(node, addr, len))
+ return node;
+ }
+ }
+ return node;
+}
+
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
+ unsigned long addr, unsigned long len)
+{
+ struct mmu_rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&handler->lock, flags);
+ node = __mmu_rb_search(handler, addr, len);
+ if (node) {
+ __mmu_int_rb_remove(node, &handler->root);
+ list_del(&node->list); /* remove from LRU list */
+ }
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ return node;
+}
+
+void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
+{
+ struct mmu_rb_node *rbnode, *ptr;
+ struct list_head del_list;
+ unsigned long flags;
+ bool stop = false;
+
+ INIT_LIST_HEAD(&del_list);
+
+ spin_lock_irqsave(&handler->lock, flags);
+ list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list,
+ list) {
+ if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg,
+ &stop)) {
+ __mmu_int_rb_remove(rbnode, &handler->root);
+ /* move from LRU list to delete list */
+ list_move(&rbnode->list, &del_list);
+ }
+ if (stop)
+ break;
+ }
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ while (!list_empty(&del_list)) {
+ rbnode = list_first_entry(&del_list, struct mmu_rb_node, list);
+ list_del(&rbnode->list);
+ handler->ops->remove(handler->ops_arg, rbnode);
+ }
+}
+
+/*
+ * It is up to the caller to ensure that this function does not race with the
+ * mmu invalidate notifier which may be calling the users remove callback on
+ * 'node'.
+ */
+void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
+ struct mmu_rb_node *node)
+{
+ unsigned long flags;
+
+ /* Validity of handler and node pointers has been checked by caller. */
+ hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
+ node->len);
+ spin_lock_irqsave(&handler->lock, flags);
+ __mmu_int_rb_remove(node, &handler->root);
+ list_del(&node->list); /* remove from LRU list */
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ handler->ops->remove(handler->ops_arg, node);
+}
+
+static inline void mmu_notifier_page(struct mmu_notifier *mn,
+ struct mm_struct *mm, unsigned long addr)
+{
+ mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE);
+}
+
+static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ mmu_notifier_mem_invalidate(mn, mm, start, end);
+}
+
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct mmu_rb_handler *handler =
+ container_of(mn, struct mmu_rb_handler, mn);
+ struct rb_root *root = &handler->root;
+ struct mmu_rb_node *node, *ptr = NULL;
+ unsigned long flags;
+ bool added = false;
+
+ spin_lock_irqsave(&handler->lock, flags);
+ for (node = __mmu_int_rb_iter_first(root, start, end - 1);
+ node; node = ptr) {
+ /* Guard against node removal. */
+ ptr = __mmu_int_rb_iter_next(node, start, end - 1);
+ hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
+ node->addr, node->len);
+ if (handler->ops->invalidate(handler->ops_arg, node)) {
+ __mmu_int_rb_remove(node, root);
+ /* move from LRU list to delete list */
+ list_move(&node->list, &handler->del_list);
+ added = true;
+ }
+ }
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ if (added)
+ queue_work(handler->wq, &handler->del_work);
+}
+
+/*
+ * Call the remove function for the given handler and the list. This
+ * is expected to be called with a delete list extracted from handler.
+ * The caller should not be holding the handler lock.
+ */
+static void do_remove(struct mmu_rb_handler *handler,
+ struct list_head *del_list)
+{
+ struct mmu_rb_node *node;
+
+ while (!list_empty(del_list)) {
+ node = list_first_entry(del_list, struct mmu_rb_node, list);
+ list_del(&node->list);
+ handler->ops->remove(handler->ops_arg, node);
+ }
+}
+
+/*
+ * Work queue function to remove all nodes that have been queued up to
+ * be removed. The key feature is that mm->mmap_sem is not being held
+ * and the remove callback can sleep while taking it, if needed.
+ */
+static void handle_remove(struct work_struct *work)
+{
+ struct mmu_rb_handler *handler = container_of(work,
+ struct mmu_rb_handler,
+ del_work);
+ struct list_head del_list;
+ unsigned long flags;
+
+ /* remove anything that is queued to get removed */
+ spin_lock_irqsave(&handler->lock, flags);
+ list_replace_init(&handler->del_list, &del_list);
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ do_remove(handler, &del_list);
+}
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h
new file mode 100644
index 000000000000..754f6ebf13fb
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MMU_RB_H
+#define _HFI1_MMU_RB_H
+
+#include "hfi.h"
+
+struct mmu_rb_node {
+ unsigned long addr;
+ unsigned long len;
+ unsigned long __last;
+ struct rb_node node;
+ struct list_head list;
+};
+
+/*
+ * NOTE: filter, insert, invalidate, and evict must not sleep. Only remove is
+ * allowed to sleep.
+ */
+struct mmu_rb_ops {
+ bool (*filter)(struct mmu_rb_node *node, unsigned long addr,
+ unsigned long len);
+ int (*insert)(void *ops_arg, struct mmu_rb_node *mnode);
+ void (*remove)(void *ops_arg, struct mmu_rb_node *mnode);
+ int (*invalidate)(void *ops_arg, struct mmu_rb_node *node);
+ int (*evict)(void *ops_arg, struct mmu_rb_node *mnode,
+ void *evict_arg, bool *stop);
+};
+
+int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+ struct mmu_rb_ops *ops,
+ struct workqueue_struct *wq,
+ struct mmu_rb_handler **handler);
+void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler);
+int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
+ struct mmu_rb_node *mnode);
+void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
+void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
+ struct mmu_rb_node *mnode);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
+ unsigned long addr, unsigned long len);
+
+#endif /* _HFI1_MMU_RB_H */
diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h
new file mode 100644
index 000000000000..6ef3c1cbdcd7
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opa_compat.h
@@ -0,0 +1,111 @@
+#ifndef _LINUX_H
+#define _LINUX_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This header file is for OPA-specific definitions which are
+ * required by the HFI driver, and which aren't yet in the Linux
+ * IB core. We'll collect these all here, then merge them into
+ * the kernel when that's convenient.
+ */
+
+/* OPA SMA attribute IDs */
+#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090)
+#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
+
+/* OPA PMA attribute IDs */
+#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040)
+#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041)
+#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042)
+#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043)
+#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044)
+
+/* OPA status codes */
+#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100)
+
+static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
+{
+ return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
+}
+
+static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
+{
+ return ((ps->portphysstate_portstate &
+ OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
+}
+
+/*
+ * OPA port physical states
+ * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
+ * values.
+ *
+ * When writing, only values 0-3 are valid, other values are ignored.
+ * When reading, 0 is reserved.
+ *
+ * Returned by the ibphys_portstate() routine.
+ */
+enum opa_port_phys_state {
+ IB_PORTPHYSSTATE_NOP = 0,
+ /* 1 is reserved */
+ IB_PORTPHYSSTATE_POLLING = 2,
+ IB_PORTPHYSSTATE_DISABLED = 3,
+ IB_PORTPHYSSTATE_TRAINING = 4,
+ IB_PORTPHYSSTATE_LINKUP = 5,
+ IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
+ IB_PORTPHYSSTATE_PHY_TEST = 7,
+ /* 8 is reserved */
+ OPA_PORTPHYSSTATE_OFFLINE = 9,
+ OPA_PORTPHYSSTATE_GANGED = 10,
+ OPA_PORTPHYSSTATE_TEST = 11,
+ OPA_PORTPHYSSTATE_MAX = 11,
+ /* values 12-15 are reserved/ignored */
+};
+
+#endif /* _LINUX_H */
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
new file mode 100644
index 000000000000..89c68da1c273
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -0,0 +1,1398 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "chip_registers.h"
+#include "aspm.h"
+
+/* link speed vector for Gen3 speed - not in Linux headers */
+#define GEN1_SPEED_VECTOR 0x1
+#define GEN2_SPEED_VECTOR 0x2
+#define GEN3_SPEED_VECTOR 0x3
+
+/*
+ * This file contains PCIe utility routines.
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ */
+static void tune_pcie_caps(struct hfi1_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success. Therefore dd_dev_err() can't be used for error
+ * printing.
+ */
+int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ /*
+ * This can happen (in theory) iff:
+ * We did a chip reset, and then failed to reprogram the
+ * BAR, or the chip reset due to an internal error. We then
+ * unloaded the driver and reloaded it.
+ *
+ * Both reset cases set the BAR back to initial state. For
+ * the latter case, the AER sticky error bit at offset 0x718
+ * should be set, but the Linux kernel doesn't yet know
+ * about that, it appears. If the original BAR was retained
+ * in the kernel data structures, this may be OK.
+ */
+ hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
+ -ret);
+ goto done;
+ }
+
+ ret = pci_request_regions(pdev, DRIVER_NAME);
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "pci_request_regions fails: err %d\n", -ret);
+ goto bail;
+ }
+
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ /*
+ * If the 64 bit setup fails, try 32 bit. Some systems
+ * do not setup 64 bit maps on systems with 2GB or less
+ * memory installed.
+ */
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "Unable to set DMA mask: %d\n", ret);
+ goto bail;
+ }
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ } else {
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ }
+ if (ret) {
+ hfi1_early_err(&pdev->dev,
+ "Unable to set DMA consistent mask: %d\n", ret);
+ goto bail;
+ }
+
+ pci_set_master(pdev);
+ (void)pci_enable_pcie_error_reporting(pdev);
+ goto done;
+
+bail:
+ hfi1_pcie_cleanup(pdev);
+done:
+ return ret;
+}
+
+/*
+ * Clean what was done in hfi1_pcie_init()
+ */
+void hfi1_pcie_cleanup(struct pci_dev *pdev)
+{
+ pci_disable_device(pdev);
+ /*
+ * Release regions should be called after the disable. OK to
+ * call if request regions has not been called or failed.
+ */
+ pci_release_regions(pdev);
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ unsigned long len;
+ resource_size_t addr;
+
+ dd->pcidev = pdev;
+ pci_set_drvdata(pdev, dd);
+
+ addr = pci_resource_start(pdev, 0);
+ len = pci_resource_len(pdev, 0);
+
+ /*
+ * The TXE PIO buffers are at the tail end of the chip space.
+ * Cut them off and map them separately.
+ */
+
+ /* sanity check vs expectations */
+ if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
+ dd_dev_err(dd, "chip PIO range does not match\n");
+ return -EINVAL;
+ }
+
+ dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
+ if (!dd->kregbase)
+ return -ENOMEM;
+
+ dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
+ if (!dd->piobase) {
+ iounmap(dd->kregbase);
+ return -ENOMEM;
+ }
+
+ dd->flags |= HFI1_PRESENT; /* now register routines work */
+
+ dd->kregend = dd->kregbase + TXE_PIO_SEND;
+ dd->physaddr = addr; /* used for io_remap, etc. */
+
+ /*
+ * Re-map the chip's RcvArray as write-combining to allow us
+ * to write an entire cacheline worth of entries in one shot.
+ * If this re-map fails, just continue - the RcvArray programming
+ * function will handle both cases.
+ */
+ dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
+ dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
+ dd->chip_rcv_array_count * 8);
+ dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+ /*
+ * Save BARs and command to rewrite after device reset.
+ */
+ dd->pcibar0 = addr;
+ dd->pcibar1 = addr >> 32;
+ pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+ pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+ &dd->pcie_devctl2);
+ pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3);
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+
+ return 0;
+}
+
+/*
+ * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior
+ * to releasing the dd memory.
+ * Void because all of the core pcie cleanup functions are void.
+ */
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
+{
+ u64 __iomem *base = (void __iomem *)dd->kregbase;
+
+ dd->flags &= ~HFI1_PRESENT;
+ dd->kregbase = NULL;
+ iounmap(base);
+ if (dd->rcvarray_wc)
+ iounmap(dd->rcvarray_wc);
+ if (dd->piobase)
+ iounmap(dd->piobase);
+}
+
+/*
+ * Do a Function Level Reset (FLR) on the device.
+ * Based on static function drivers/pci/pci.c:pcie_flr().
+ */
+void hfi1_pcie_flr(struct hfi1_devdata *dd)
+{
+ int i;
+ u16 status;
+
+ /* no need to check for the capability - we know the device has it */
+
+ /* wait for Transaction Pending bit to clear, at most a few ms */
+ for (i = 0; i < 4; i++) {
+ if (i)
+ msleep((1 << (i - 1)) * 100);
+
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
+ if (!(status & PCI_EXP_DEVSTA_TRPND))
+ goto clear;
+ }
+
+ dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
+
+clear:
+ pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
+ PCI_EXP_DEVCTL_BCR_FLR);
+ /* PCIe spec requires the function to be back within 100ms */
+ msleep(100);
+}
+
+static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
+ struct hfi1_msix_entry *hfi1_msix_entry)
+{
+ int ret;
+ int nvec = *msixcnt;
+ struct msix_entry *msix_entry;
+ int i;
+
+ /*
+ * We can't pass hfi1_msix_entry array to msix_setup
+ * so use a dummy msix_entry array and copy the allocated
+ * irq back to the hfi1_msix_entry array.
+ */
+ msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
+ if (!msix_entry) {
+ ret = -ENOMEM;
+ goto do_intx;
+ }
+
+ for (i = 0; i < nvec; i++)
+ msix_entry[i] = hfi1_msix_entry[i].msix;
+
+ ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+ if (ret < 0)
+ goto free_msix_entry;
+ nvec = ret;
+
+ for (i = 0; i < nvec; i++)
+ hfi1_msix_entry[i].msix = msix_entry[i];
+
+ kfree(msix_entry);
+ *msixcnt = nvec;
+ return;
+
+free_msix_entry:
+ kfree(msix_entry);
+
+do_intx:
+ dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+ nvec, ret);
+ *msixcnt = 0;
+ hfi1_enable_intx(dd->pcidev);
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_speed(u16 linkstat)
+{
+ u32 speed;
+
+ switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+ default: /* not defined, assume Gen1 */
+ case PCI_EXP_LNKSTA_CLS_2_5GB:
+ speed = 2500; /* Gen 1, 2.5GHz */
+ break;
+ case PCI_EXP_LNKSTA_CLS_5_0GB:
+ speed = 5000; /* Gen 2, 5GHz */
+ break;
+ case GEN3_SPEED_VECTOR:
+ speed = 8000; /* Gen 3, 8GHz */
+ break;
+ }
+ return speed;
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_width(u16 linkstat)
+{
+ return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
+static void update_lbus_info(struct hfi1_devdata *dd)
+{
+ u16 linkstat;
+
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+ dd->lbus_width = extract_width(linkstat);
+ dd->lbus_speed = extract_speed(linkstat);
+ snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+ "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
+}
+
+/*
+ * Read in the current PCIe link width and speed. Find if the link is
+ * Gen3 capable.
+ */
+int pcie_speeds(struct hfi1_devdata *dd)
+{
+ u32 linkcap;
+ struct pci_dev *parent = dd->pcidev->bus->self;
+
+ if (!pci_is_pcie(dd->pcidev)) {
+ dd_dev_err(dd, "Can't find PCI Express capability!\n");
+ return -EINVAL;
+ }
+
+ /* find if our max speed is Gen3 and parent supports Gen3 speeds */
+ dd->link_gen3_capable = 1;
+
+ pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+ if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
+ dd_dev_info(dd,
+ "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+ linkcap & PCI_EXP_LNKCAP_SLS);
+ dd->link_gen3_capable = 0;
+ }
+
+ /*
+ * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
+ */
+ if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+ dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
+ dd->link_gen3_capable = 0;
+ }
+
+ /* obtain the link width and current speed */
+ update_lbus_info(dd);
+
+ dd_dev_info(dd, "%s\n", dd->lbus_info);
+
+ return 0;
+}
+
+/*
+ * Returns in *nent:
+ * - actual number of interrupts allocated
+ * - 0 if fell back to INTx.
+ */
+void request_msix(struct hfi1_devdata *dd, u32 *nent,
+ struct hfi1_msix_entry *entry)
+{
+ int pos;
+
+ pos = dd->pcidev->msix_cap;
+ if (*nent && pos) {
+ msix_setup(dd, pos, nent, entry);
+ /* did it, either MSI-X or INTx */
+ } else {
+ *nent = 0;
+ hfi1_enable_intx(dd->pcidev);
+ }
+
+ tune_pcie_caps(dd);
+}
+
+void hfi1_enable_intx(struct pci_dev *pdev)
+{
+ /* first, turn on INTx */
+ pci_intx(pdev, 1);
+ /* then turn off MSI-X */
+ pci_disable_msix(pdev);
+}
+
+/* restore command and BARs after a reset has wiped them out */
+void restore_pci_variables(struct hfi1_devdata *dd)
+{
+ pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+ pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0);
+ pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1);
+ pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+ dd->pcie_devctl2);
+ pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3);
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+}
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int hfi1_pcie_caps;
+module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+uint aspm_mode = ASPM_MODE_DISABLED;
+module_param_named(aspm, aspm_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
+
+static void tune_pcie_caps(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent;
+ u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+ u16 rc_mrrs, ep_mrrs, max_mrrs, ectl;
+
+ /*
+ * Turn on extended tags in DevCtl in case the BIOS has turned it off
+ * to improve WFR SDMA bandwidth
+ */
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl);
+ if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) {
+ dd_dev_info(dd, "Enabling PCIe extended tags\n");
+ ectl |= PCI_EXP_DEVCTL_EXT_TAG;
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl);
+ }
+ /* Find out supported and configured values for parent (root) */
+ parent = dd->pcidev->bus->self;
+ /*
+ * The driver cannot perform the tuning if it does not have
+ * access to the upstream component.
+ */
+ if (!parent)
+ return;
+ if (!pci_is_root_bus(parent->bus)) {
+ dd_dev_info(dd, "Parent not root\n");
+ return;
+ }
+
+ if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+ return;
+ rc_mpss = parent->pcie_mpss;
+ rc_mps = ffs(pcie_get_mps(parent)) - 8;
+ /* Find out supported and configured values for endpoint (us) */
+ ep_mpss = dd->pcidev->pcie_mpss;
+ ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+ /* Find max payload supported by root, endpoint */
+ if (rc_mpss > ep_mpss)
+ rc_mpss = ep_mpss;
+
+ /* If Supported greater than limit in module param, limit it */
+ if (rc_mpss > (hfi1_pcie_caps & 7))
+ rc_mpss = hfi1_pcie_caps & 7;
+ /* If less than (allowed, supported), bump root payload */
+ if (rc_mpss > rc_mps) {
+ rc_mps = rc_mpss;
+ pcie_set_mps(parent, 128 << rc_mps);
+ }
+ /* If less than (allowed, supported), bump endpoint payload */
+ if (rc_mpss > ep_mps) {
+ ep_mps = rc_mpss;
+ pcie_set_mps(dd->pcidev, 128 << ep_mps);
+ }
+
+ /*
+ * Now the Read Request size.
+ * No field for max supported, but PCIe spec limits it to 4096,
+ * which is code '5' (log2(4096) - 7)
+ */
+ max_mrrs = 5;
+ if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
+ max_mrrs = (hfi1_pcie_caps >> 4) & 7;
+
+ max_mrrs = 128 << max_mrrs;
+ rc_mrrs = pcie_get_readrq(parent);
+ ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+ if (max_mrrs > rc_mrrs) {
+ rc_mrrs = max_mrrs;
+ pcie_set_readrq(parent, rc_mrrs);
+ }
+ if (max_mrrs > ep_mrrs) {
+ ep_mrrs = max_mrrs;
+ pcie_set_readrq(dd->pcidev, ep_mrrs);
+ }
+}
+
+/* End of PCIe capability tuning */
+
+/*
+ * From here through hfi1_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+static pci_ers_result_t
+pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+ pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+ switch (state) {
+ case pci_channel_io_normal:
+ dd_dev_info(dd, "State Normal, ignoring\n");
+ break;
+
+ case pci_channel_io_frozen:
+ dd_dev_info(dd, "State Frozen, requesting reset\n");
+ pci_disable_device(pdev);
+ ret = PCI_ERS_RESULT_NEED_RESET;
+ break;
+
+ case pci_channel_io_perm_failure:
+ if (dd) {
+ dd_dev_info(dd, "State Permanent Failure, disabling\n");
+ /* no more register accesses! */
+ dd->flags &= ~HFI1_PRESENT;
+ hfi1_disable_after_error(dd);
+ }
+ /* else early, or other problem */
+ ret = PCI_ERS_RESULT_DISCONNECT;
+ break;
+
+ default: /* shouldn't happen */
+ dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
+ state);
+ break;
+ }
+ return ret;
+}
+
+static pci_ers_result_t
+pci_mmio_enabled(struct pci_dev *pdev)
+{
+ u64 words = 0U;
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+ pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+ if (dd && dd->pport) {
+ words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
+ if (words == ~0ULL)
+ ret = PCI_ERS_RESULT_NEED_RESET;
+ dd_dev_info(dd,
+ "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n",
+ words, ret);
+ }
+ return ret;
+}
+
+static pci_ers_result_t
+pci_slot_reset(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+pci_link_reset(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+pci_resume(struct pci_dev *pdev)
+{
+ struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+ dd_dev_info(dd, "HFI1 resume function called\n");
+ pci_cleanup_aer_uncorrect_error_status(pdev);
+ /*
+ * Running jobs will fail, since it's asynchronous
+ * unlike sysfs-requested reset. Better than
+ * doing nothing.
+ */
+ hfi1_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers hfi1_pci_err_handler = {
+ .error_detected = pci_error_detected,
+ .mmio_enabled = pci_mmio_enabled,
+ .link_reset = pci_link_reset,
+ .slot_reset = pci_slot_reset,
+ .resume = pci_resume,
+};
+
+/*============================================================================*/
+/* PCIe Gen3 support */
+
+/*
+ * This code is separated out because it is expected to be removed in the
+ * final shipping product. If not, then it will be revisited and items
+ * will be moved to more standard locations.
+ */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
+#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */
+#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */
+#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
+#define DL_ERR_NONE 0x0 /* no error */
+#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */
+ /* or response data */
+#define DL_ERR_DISABLED 0x2 /* hfi disabled */
+#define DL_ERR_SECURITY 0x3 /* security check failed */
+#define DL_ERR_SBUS 0x4 /* SBus status error */
+#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer*/
+
+/* gasket block secondary bus reset delay */
+#define SBR_DELAY_US 200000 /* 200ms */
+
+/* mask for PCIe capability register lnkctl2 target link speed */
+#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
+
+static uint pcie_target = 3;
+module_param(pcie_target, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
+
+static uint pcie_force;
+module_param(pcie_force, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
+
+static uint pcie_retry = 5;
+module_param(pcie_retry, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
+
+#define UNSET_PSET 255
+#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */
+#define DEFAULT_MCP_PSET 4 /* MCP HFI */
+static uint pcie_pset = UNSET_PSET;
+module_param(pcie_pset, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
+
+static uint pcie_ctle = 1; /* discrete on, integrated off */
+module_param(pcie_ctle, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off");
+
+/* equalization columns */
+#define PREC 0
+#define ATTN 1
+#define POST 2
+
+/* discrete silicon preliminary equalization values */
+static const u8 discrete_preliminary_eq[11][3] = {
+ /* prec attn post */
+ { 0x00, 0x00, 0x12 }, /* p0 */
+ { 0x00, 0x00, 0x0c }, /* p1 */
+ { 0x00, 0x00, 0x0f }, /* p2 */
+ { 0x00, 0x00, 0x09 }, /* p3 */
+ { 0x00, 0x00, 0x00 }, /* p4 */
+ { 0x06, 0x00, 0x00 }, /* p5 */
+ { 0x09, 0x00, 0x00 }, /* p6 */
+ { 0x06, 0x00, 0x0f }, /* p7 */
+ { 0x09, 0x00, 0x09 }, /* p8 */
+ { 0x0c, 0x00, 0x00 }, /* p9 */
+ { 0x00, 0x00, 0x18 }, /* p10 */
+};
+
+/* integrated silicon preliminary equalization values */
+static const u8 integrated_preliminary_eq[11][3] = {
+ /* prec attn post */
+ { 0x00, 0x1e, 0x07 }, /* p0 */
+ { 0x00, 0x1e, 0x05 }, /* p1 */
+ { 0x00, 0x1e, 0x06 }, /* p2 */
+ { 0x00, 0x1e, 0x04 }, /* p3 */
+ { 0x00, 0x1e, 0x00 }, /* p4 */
+ { 0x03, 0x1e, 0x00 }, /* p5 */
+ { 0x04, 0x1e, 0x00 }, /* p6 */
+ { 0x03, 0x1e, 0x06 }, /* p7 */
+ { 0x03, 0x1e, 0x04 }, /* p8 */
+ { 0x05, 0x1e, 0x00 }, /* p9 */
+ { 0x00, 0x1e, 0x0a }, /* p10 */
+};
+
+static const u8 discrete_ctle_tunings[11][4] = {
+ /* DC LF HF BW */
+ { 0x48, 0x0b, 0x04, 0x04 }, /* p0 */
+ { 0x60, 0x05, 0x0f, 0x0a }, /* p1 */
+ { 0x50, 0x09, 0x06, 0x06 }, /* p2 */
+ { 0x68, 0x05, 0x0f, 0x0a }, /* p3 */
+ { 0x80, 0x05, 0x0f, 0x0a }, /* p4 */
+ { 0x70, 0x05, 0x0f, 0x0a }, /* p5 */
+ { 0x68, 0x05, 0x0f, 0x0a }, /* p6 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */
+ { 0x48, 0x09, 0x06, 0x06 }, /* p8 */
+ { 0x60, 0x05, 0x0f, 0x0a }, /* p9 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p10 */
+};
+
+static const u8 integrated_ctle_tunings[11][4] = {
+ /* DC LF HF BW */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p0 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p1 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p2 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p3 */
+ { 0x58, 0x0a, 0x05, 0x05 }, /* p4 */
+ { 0x48, 0x0a, 0x05, 0x05 }, /* p5 */
+ { 0x40, 0x0a, 0x05, 0x05 }, /* p6 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p7 */
+ { 0x38, 0x0f, 0x00, 0x00 }, /* p8 */
+ { 0x38, 0x09, 0x06, 0x06 }, /* p9 */
+ { 0x38, 0x0e, 0x01, 0x01 }, /* p10 */
+};
+
+/* helper to format the value to write to hardware */
+#define eq_value(pre, curr, post) \
+ ((((u32)(pre)) << \
+ PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
+ | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
+ | (((u32)(post)) << \
+ PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
+
+/*
+ * Load the given EQ preset table into the PCIe hardware.
+ */
+static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
+ u8 div)
+{
+ struct pci_dev *pdev = dd->pcidev;
+ u32 hit_error = 0;
+ u32 violation;
+ u32 i;
+ u8 c_minus1, c0, c_plus1;
+
+ for (i = 0; i < 11; i++) {
+ /* set index */
+ pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
+ /* write the value */
+ c_minus1 = eq[i][PREC] / div;
+ c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
+ c_plus1 = eq[i][POST] / div;
+ pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
+ eq_value(c_minus1, c0, c_plus1));
+ /* check if these coefficients violate EQ rules */
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
+ &violation);
+ if (violation
+ & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){
+ if (hit_error == 0) {
+ dd_dev_err(dd,
+ "Gen3 EQ Table Coefficient rule violations\n");
+ dd_dev_err(dd, " prec attn post\n");
+ }
+ dd_dev_err(dd, " p%02d: %02x %02x %02x\n",
+ i, (u32)eq[i][0], (u32)eq[i][1],
+ (u32)eq[i][2]);
+ dd_dev_err(dd, " %02x %02x %02x\n",
+ (u32)c_minus1, (u32)c0, (u32)c_plus1);
+ hit_error = 1;
+ }
+ }
+ if (hit_error)
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Steps to be done after the PCIe firmware is downloaded and
+ * before the SBR for the Pcie Gen3.
+ * The SBus resource is already being held.
+ */
+static void pcie_post_steps(struct hfi1_devdata *dd)
+{
+ int i;
+
+ set_sbus_fast_mode(dd);
+ /*
+ * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
+ * This avoids a spurious framing error that can otherwise be
+ * generated by the MAC layer.
+ *
+ * Use individual addresses since no broadcast is set up.
+ */
+ for (i = 0; i < NUM_PCIE_SERDES; i++) {
+ sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
+ 0x03, WRITE_SBUS_RECEIVER, 0x00022132);
+ }
+
+ clear_sbus_fast_mode(dd);
+}
+
+/*
+ * Trigger a secondary bus reset (SBR) on ourselves using our parent.
+ *
+ * Based on pci_parent_bus_reset() which is not exported by the
+ * kernel core.
+ */
+static int trigger_sbr(struct hfi1_devdata *dd)
+{
+ struct pci_dev *dev = dd->pcidev;
+ struct pci_dev *pdev;
+
+ /* need a parent */
+ if (!dev->bus->self) {
+ dd_dev_err(dd, "%s: no parent device\n", __func__);
+ return -ENOTTY;
+ }
+
+ /* should not be anyone else on the bus */
+ list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+ if (pdev != dev) {
+ dd_dev_err(dd,
+ "%s: another device is on the same bus\n",
+ __func__);
+ return -ENOTTY;
+ }
+
+ /*
+ * A secondary bus reset (SBR) issues a hot reset to our device.
+ * The following routine does a 1s wait after the reset is dropped
+ * per PCI Trhfa (recovery time). PCIe 3.0 section 6.6.1 -
+ * Conventional Reset, paragraph 3, line 35 also says that a 1s
+ * delay after a reset is required. Per spec requirements,
+ * the link is either working or not after that point.
+ */
+ pci_reset_bridge_secondary_bus(dev->bus->self);
+
+ return 0;
+}
+
+/*
+ * Write the given gasket interrupt register.
+ */
+static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
+ u16 code, u16 data)
+{
+ write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
+ (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) |
+ ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
+}
+
+/*
+ * Tell the gasket logic how to react to the reset.
+ */
+static void arm_gasket_logic(struct hfi1_devdata *dd)
+{
+ u64 reg;
+
+ reg = (((u64)1 << dd->hfi1_id) <<
+ ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) |
+ ((u64)pcie_serdes_broadcast[dd->hfi1_id] <<
+ ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT |
+ ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK |
+ ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) <<
+ ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT);
+ write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
+ /* read back to push the write */
+ read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
+}
+
+/*
+ * CCE_PCIE_CTRL long name helpers
+ * We redefine these shorter macros to use in the code while leaving
+ * chip_registers.h to be autogenerated from the hardware spec.
+ */
+#define LANE_BUNDLE_MASK CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK
+#define LANE_BUNDLE_SHIFT CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT
+#define LANE_DELAY_MASK CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK
+#define LANE_DELAY_SHIFT CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT
+#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT
+#define MARGIN_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_SHIFT
+#define MARGIN_G1_G2_OVERWRITE_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK
+#define MARGIN_G1_G2_OVERWRITE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT
+#define MARGIN_GEN1_GEN2_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK
+#define MARGIN_GEN1_GEN2_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT
+
+ /*
+ * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C).
+ */
+static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname)
+{
+ u64 pcie_ctrl;
+ u64 xmt_margin;
+ u64 xmt_margin_oe;
+ u64 lane_delay;
+ u64 lane_bundle;
+
+ pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL);
+
+ /*
+ * For Discrete, use full-swing.
+ * - PCIe TX defaults to full-swing.
+ * Leave this register as default.
+ * For Integrated, use half-swing
+ * - Copy xmt_margin and xmt_margin_oe
+ * from Gen1/Gen2 to Gen3.
+ */
+ if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */
+ /* extract initial fields */
+ xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT)
+ & MARGIN_GEN1_GEN2_MASK;
+ xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT)
+ & MARGIN_G1_G2_OVERWRITE_MASK;
+ lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK;
+ lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT)
+ & LANE_BUNDLE_MASK;
+
+ /*
+ * For A0, EFUSE values are not set. Override with the
+ * correct values.
+ */
+ if (is_ax(dd)) {
+ /*
+ * xmt_margin and OverwiteEnabel should be the
+ * same for Gen1/Gen2 and Gen3
+ */
+ xmt_margin = 0x5;
+ xmt_margin_oe = 0x1;
+ lane_delay = 0xF; /* Delay 240ns. */
+ lane_bundle = 0x0; /* Set to 1 lane. */
+ }
+
+ /* overwrite existing values */
+ pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT)
+ | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT)
+ | (xmt_margin << MARGIN_SHIFT)
+ | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT)
+ | (lane_delay << LANE_DELAY_SHIFT)
+ | (lane_bundle << LANE_BUNDLE_SHIFT);
+
+ write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl);
+ }
+
+ dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n",
+ fname, pcie_ctrl);
+}
+
+/*
+ * Do all the steps needed to transition the PCIe link to Gen3 speed.
+ */
+int do_pcie_gen3_transition(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+ u64 fw_ctrl;
+ u64 reg, therm;
+ u32 reg32, fs, lf;
+ u32 status, err;
+ int ret;
+ int do_retry, retry_count = 0;
+ int intnum = 0;
+ uint default_pset;
+ u16 target_vector, target_speed;
+ u16 lnkctl2, vendor;
+ u8 div;
+ const u8 (*eq)[3];
+ const u8 (*ctle_tunings)[4];
+ uint static_ctle_mode;
+ int return_error = 0;
+
+ /* PCIe Gen3 is for the ASIC only */
+ if (dd->icode != ICODE_RTL_SILICON)
+ return 0;
+
+ if (pcie_target == 1) { /* target Gen1 */
+ target_vector = GEN1_SPEED_VECTOR;
+ target_speed = 2500;
+ } else if (pcie_target == 2) { /* target Gen2 */
+ target_vector = GEN2_SPEED_VECTOR;
+ target_speed = 5000;
+ } else if (pcie_target == 3) { /* target Gen3 */
+ target_vector = GEN3_SPEED_VECTOR;
+ target_speed = 8000;
+ } else {
+ /* off or invalid target - skip */
+ dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
+ return 0;
+ }
+
+ /* if already at target speed, done (unless forced) */
+ if (dd->lbus_speed == target_speed) {
+ dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
+ pcie_target,
+ pcie_force ? "re-doing anyway" : "skipping");
+ if (!pcie_force)
+ return 0;
+ }
+
+ /*
+ * The driver cannot do the transition if it has no access to the
+ * upstream component
+ */
+ if (!parent) {
+ dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n",
+ __func__);
+ return 0;
+ }
+
+ /*
+ * Do the Gen3 transition. Steps are those of the PCIe Gen3
+ * recipe.
+ */
+
+ /* step 1: pcie link working in gen1/gen2 */
+
+ /* step 2: if either side is not capable of Gen3, done */
+ if (pcie_target == 3 && !dd->link_gen3_capable) {
+ dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
+ ret = -ENOSYS;
+ goto done_no_mutex;
+ }
+
+ /* hold the SBus resource across the firmware download and SBR */
+ ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+ if (ret) {
+ dd_dev_err(dd, "%s: unable to acquire SBus resource\n",
+ __func__);
+ return ret;
+ }
+
+ /* make sure thermal polling is not causing interrupts */
+ therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
+ if (therm) {
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+ msleep(100);
+ dd_dev_info(dd, "%s: Disabled therm polling\n",
+ __func__);
+ }
+
+retry:
+ /* the SBus download will reset the spico for thermal */
+
+ /* step 3: download SBus Master firmware */
+ /* step 4: download PCIe Gen3 SerDes firmware */
+ dd_dev_info(dd, "%s: downloading firmware\n", __func__);
+ ret = load_pcie_firmware(dd);
+ if (ret) {
+ /* do not proceed if the firmware cannot be downloaded */
+ return_error = 1;
+ goto done;
+ }
+
+ /* step 5: set up device parameter settings */
+ dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
+
+ /*
+ * PcieCfgSpcie1 - Link Control 3
+ * Leave at reset value. No need to set PerfEq - link equalization
+ * will be performed automatically after the SBR when the target
+ * speed is 8GT/s.
+ */
+
+ /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
+
+ /* step 5a: Set Synopsys Port Logic registers */
+
+ /*
+ * PcieCfgRegPl2 - Port Force Link
+ *
+ * Set the low power field to 0x10 to avoid unnecessary power
+ * management messages. All other fields are zero.
+ */
+ reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
+
+ /*
+ * PcieCfgRegPl100 - Gen3 Control
+ *
+ * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
+ * turn on PcieCfgRegPl100.EqEieosCnt
+ * Everything else zero.
+ */
+ reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
+
+ /*
+ * PcieCfgRegPl101 - Gen3 EQ FS and LF
+ * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
+ * PcieCfgRegPl103 - Gen3 EQ Preset Index
+ * PcieCfgRegPl105 - Gen3 EQ Status
+ *
+ * Give initial EQ settings.
+ */
+ if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
+ /* 1000mV, FS=24, LF = 8 */
+ fs = 24;
+ lf = 8;
+ div = 3;
+ eq = discrete_preliminary_eq;
+ default_pset = DEFAULT_DISCRETE_PSET;
+ ctle_tunings = discrete_ctle_tunings;
+ /* bit 0 - discrete on/off */
+ static_ctle_mode = pcie_ctle & 0x1;
+ } else {
+ /* 400mV, FS=29, LF = 9 */
+ fs = 29;
+ lf = 9;
+ div = 1;
+ eq = integrated_preliminary_eq;
+ default_pset = DEFAULT_MCP_PSET;
+ ctle_tunings = integrated_ctle_tunings;
+ /* bit 1 - integrated on/off */
+ static_ctle_mode = (pcie_ctle >> 1) & 0x1;
+ }
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
+ (fs <<
+ PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) |
+ (lf <<
+ PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+ ret = load_eq_table(dd, eq, fs, div);
+ if (ret)
+ goto done;
+
+ /*
+ * PcieCfgRegPl106 - Gen3 EQ Control
+ *
+ * Set Gen3EqPsetReqVec, leave other fields 0.
+ */
+ if (pcie_pset == UNSET_PSET)
+ pcie_pset = default_pset;
+ if (pcie_pset > 10) { /* valid range is 0-10, inclusive */
+ dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
+ __func__, pcie_pset, default_pset);
+ pcie_pset = default_pset;
+ }
+ dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
+ ((1 << pcie_pset) <<
+ PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) |
+ PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK |
+ PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+
+ /*
+ * step 5b: Do post firmware download steps via SBus
+ */
+ dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
+ pcie_post_steps(dd);
+
+ /*
+ * step 5c: Program gasket interrupts
+ */
+ /* set the Rx Bit Rate to REFCLK ratio */
+ write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050);
+ /* disable pCal for PCIe Gen3 RX equalization */
+ /* select adaptive or static CTLE */
+ write_gasket_interrupt(dd, intnum++, 0x0026,
+ 0x5b01 | (static_ctle_mode << 3));
+ /*
+ * Enable iCal for PCIe Gen3 RX equalization, and set which
+ * evaluation of RX_EQ_EVAL will launch the iCal procedure.
+ */
+ write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202);
+
+ if (static_ctle_mode) {
+ /* apply static CTLE tunings */
+ u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw;
+
+ pcie_dc = ctle_tunings[pcie_pset][0];
+ pcie_lf = ctle_tunings[pcie_pset][1];
+ pcie_hf = ctle_tunings[pcie_pset][2];
+ pcie_bw = ctle_tunings[pcie_pset][3];
+ write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc);
+ write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf);
+ write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf);
+ write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw);
+ }
+
+ /* terminate list */
+ write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000);
+
+ /*
+ * step 5d: program XMT margin
+ */
+ write_xmt_margin(dd, __func__);
+
+ /*
+ * step 5e: disable active state power management (ASPM). It
+ * will be enabled if required later
+ */
+ dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
+ aspm_hw_disable_l1(dd);
+
+ /*
+ * step 5f: clear DirectSpeedChange
+ * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
+ * change in the speed target from starting before we are ready.
+ * This field defaults to 0 and we are not changing it, so nothing
+ * needs to be done.
+ */
+
+ /* step 5g: Set target link speed */
+ /*
+ * Set target link speed to be target on both device and parent.
+ * On setting the parent: Some system BIOSs "helpfully" set the
+ * parent target speed to Gen2 to match the ASIC's initial speed.
+ * We can set the target Gen3 because we have already checked
+ * that it is Gen3 capable earlier.
+ */
+ dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
+ pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+ dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ /* only write to parent if target is not as high as ours */
+ if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
+ lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+ lnkctl2 |= target_vector;
+ dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+ } else {
+ dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
+ }
+
+ dd_dev_info(dd, "%s: setting target link speed\n", __func__);
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+ dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+ lnkctl2 |= target_vector;
+ dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+ (u32)lnkctl2);
+ pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+
+ /* step 5h: arm gasket logic */
+ /* hold DC in reset across the SBR */
+ write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+ (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+ /* save firmware control across the SBR */
+ fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
+
+ dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
+ arm_gasket_logic(dd);
+
+ /*
+ * step 6: quiesce PCIe link
+ * The chip has already been reset, so there will be no traffic
+ * from the chip. Linux has no easy way to enforce that it will
+ * not try to access the device, so we just need to hope it doesn't
+ * do it while we are doing the reset.
+ */
+
+ /*
+ * step 7: initiate the secondary bus reset (SBR)
+ * step 8: hardware brings the links back up
+ * step 9: wait for link speed transition to be complete
+ */
+ dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
+ ret = trigger_sbr(dd);
+ if (ret)
+ goto done;
+
+ /* step 10: decide what to do next */
+
+ /* check if we can read PCI space */
+ ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
+ if (ret) {
+ dd_dev_info(dd,
+ "%s: read of VendorID failed after SBR, err %d\n",
+ __func__, ret);
+ return_error = 1;
+ goto done;
+ }
+ if (vendor == 0xffff) {
+ dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
+ return_error = 1;
+ ret = -EIO;
+ goto done;
+ }
+
+ /* restore PCI space registers we know were reset */
+ dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
+ restore_pci_variables(dd);
+ /* restore firmware control */
+ write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
+
+ /*
+ * Check the gasket block status.
+ *
+ * This is the first CSR read after the SBR. If the read returns
+ * all 1s (fails), the link did not make it back.
+ *
+ * Once we're sure we can read and write, clear the DC reset after
+ * the SBR. Then check for any per-lane errors. Then look over
+ * the status.
+ */
+ reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
+ dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
+ if (reg == ~0ull) { /* PCIe read failed/timeout */
+ dd_dev_err(dd, "SBR failed - unable to read from device\n");
+ return_error = 1;
+ ret = -ENOSYS;
+ goto done;
+ }
+
+ /* clear the DC reset */
+ write_csr(dd, CCE_DC_CTRL, 0);
+
+ /* Set the LED off */
+ setextled(dd, 0);
+
+ /* check for any per-lane errors */
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+ dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
+
+ /* extract status, look for our HFI */
+ status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
+ & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
+ if ((status & (1 << dd->hfi1_id)) == 0) {
+ dd_dev_err(dd,
+ "%s: gasket status 0x%x, expecting 0x%x\n",
+ __func__, status, 1 << dd->hfi1_id);
+ ret = -EIO;
+ goto done;
+ }
+
+ /* extract error */
+ err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
+ & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
+ if (err) {
+ dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
+ ret = -EIO;
+ goto done;
+ }
+
+ /* update our link information cache */
+ update_lbus_info(dd);
+ dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
+ dd->lbus_info);
+
+ if (dd->lbus_speed != target_speed) { /* not target */
+ /* maybe retry */
+ do_retry = retry_count < pcie_retry;
+ dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
+ pcie_target, do_retry ? ", retrying" : "");
+ retry_count++;
+ if (do_retry) {
+ msleep(100); /* allow time to settle */
+ goto retry;
+ }
+ ret = -EIO;
+ }
+
+done:
+ if (therm) {
+ write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+ msleep(100);
+ dd_dev_info(dd, "%s: Re-enable therm polling\n",
+ __func__);
+ }
+ release_chip_resource(dd, CR_SBUS);
+done_no_mutex:
+ /* return no error if it is OK to be at current speed */
+ if (ret && !return_error) {
+ dd_dev_err(dd, "Proceeding at current speed PCIe speed\n");
+ ret = 0;
+ }
+
+ dd_dev_info(dd, "%s: done\n", __func__);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
new file mode 100644
index 000000000000..ac1bf4a73571
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -0,0 +1,2105 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+
+#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
+
+#define SC(name) SEND_CTXT_##name
+/*
+ * Send Context functions
+ */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
+
+/*
+ * Set the CM reset bit and wait for it to clear. Use the provided
+ * sendctrl register. This routine has no locking.
+ */
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
+{
+ write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
+ while (1) {
+ udelay(1);
+ sendctrl = read_csr(dd, SEND_CTRL);
+ if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
+ break;
+ }
+}
+
+/* defined in header release 48 and higher */
+#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+ << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
+#endif
+
+/* global control of PIO send */
+void pio_send_control(struct hfi1_devdata *dd, int op)
+{
+ u64 reg, mask;
+ unsigned long flags;
+ int write = 1; /* write sendctrl back */
+ int flush = 0; /* re-read sendctrl to make sure it is flushed */
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+ reg = read_csr(dd, SEND_CTRL);
+ switch (op) {
+ case PSC_GLOBAL_ENABLE:
+ reg |= SEND_CTRL_SEND_ENABLE_SMASK;
+ /* Fall through */
+ case PSC_DATA_VL_ENABLE:
+ /* Disallow sending on VLs not enabled */
+ mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
+ SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+ reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
+ break;
+ case PSC_GLOBAL_DISABLE:
+ reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
+ break;
+ case PSC_GLOBAL_VLARB_ENABLE:
+ reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+ break;
+ case PSC_GLOBAL_VLARB_DISABLE:
+ reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+ break;
+ case PSC_CM_RESET:
+ __cm_reset(dd, reg);
+ write = 0; /* CSR already written (and flushed) */
+ break;
+ case PSC_DATA_VL_DISABLE:
+ reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
+ flush = 1;
+ break;
+ default:
+ dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
+ break;
+ }
+
+ if (write) {
+ write_csr(dd, SEND_CTRL, reg);
+ if (flush)
+ (void)read_csr(dd, SEND_CTRL); /* flush write */
+ }
+
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/* number of send context memory pools */
+#define NUM_SC_POOLS 2
+
+/* Send Context Size (SCS) wildcards */
+#define SCS_POOL_0 -1
+#define SCS_POOL_1 -2
+
+/* Send Context Count (SCC) wildcards */
+#define SCC_PER_VL -1
+#define SCC_PER_CPU -2
+#define SCC_PER_KRCVQ -3
+
+/* Send Context Size (SCS) constants */
+#define SCS_ACK_CREDITS 32
+#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */
+
+#define PIO_THRESHOLD_CEILING 4096
+
+#define PIO_WAIT_BATCH_SIZE 5
+
+/* default send context sizes */
+static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
+ [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */
+ .count = SCC_PER_VL }, /* one per NUMA */
+ [SC_ACK] = { .size = SCS_ACK_CREDITS,
+ .count = SCC_PER_KRCVQ },
+ [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */
+ .count = SCC_PER_CPU }, /* one per CPU */
+ [SC_VL15] = { .size = SCS_VL15_CREDITS,
+ .count = 1 },
+
+};
+
+/* send context memory pool configuration */
+struct mem_pool_config {
+ int centipercent; /* % of memory, in 100ths of 1% */
+ int absolute_blocks; /* absolute block count */
+};
+
+/* default memory pool configuration: 100% in pool 0 */
+static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
+ /* centi%, abs blocks */
+ { 10000, -1 }, /* pool 0 */
+ { 0, -1 }, /* pool 1 */
+};
+
+/* memory pool information, used when calculating final sizes */
+struct mem_pool_info {
+ int centipercent; /*
+ * 100th of 1% of memory to use, -1 if blocks
+ * already set
+ */
+ int count; /* count of contexts in the pool */
+ int blocks; /* block size of the pool */
+ int size; /* context size, in blocks */
+};
+
+/*
+ * Convert a pool wildcard to a valid pool index. The wildcards
+ * start at -1 and increase negatively. Map them as:
+ * -1 => 0
+ * -2 => 1
+ * etc.
+ *
+ * Return -1 on non-wildcard input, otherwise convert to a pool number.
+ */
+static int wildcard_to_pool(int wc)
+{
+ if (wc >= 0)
+ return -1; /* non-wildcard */
+ return -wc - 1;
+}
+
+static const char *sc_type_names[SC_MAX] = {
+ "kernel",
+ "ack",
+ "user",
+ "vl15"
+};
+
+static const char *sc_type_name(int index)
+{
+ if (index < 0 || index >= SC_MAX)
+ return "unknown";
+ return sc_type_names[index];
+}
+
+/*
+ * Read the send context memory pool configuration and send context
+ * size configuration. Replace any wildcards and come up with final
+ * counts and sizes for the send context types.
+ */
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
+{
+ struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
+ int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
+ int total_contexts = 0;
+ int fixed_blocks;
+ int pool_blocks;
+ int used_blocks;
+ int cp_total; /* centipercent total */
+ int ab_total; /* absolute block total */
+ int extra;
+ int i;
+
+ /*
+ * When SDMA is enabled, kernel context pio packet size is capped by
+ * "piothreshold". Reduce pio buffer allocation for kernel context by
+ * setting it to a fixed size. The allocation allows 3-deep buffering
+ * of the largest pio packets plus up to 128 bytes header, sufficient
+ * to maintain verbs performance.
+ *
+ * When SDMA is disabled, keep the default pooling allocation.
+ */
+ if (HFI1_CAP_IS_KSET(SDMA)) {
+ u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ?
+ piothreshold : PIO_THRESHOLD_CEILING;
+ sc_config_sizes[SC_KERNEL].size =
+ 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE;
+ }
+
+ /*
+ * Step 0:
+ * - copy the centipercents/absolute sizes from the pool config
+ * - sanity check these values
+ * - add up centipercents, then later check for full value
+ * - add up absolute blocks, then later check for over-commit
+ */
+ cp_total = 0;
+ ab_total = 0;
+ for (i = 0; i < NUM_SC_POOLS; i++) {
+ int cp = sc_mem_pool_config[i].centipercent;
+ int ab = sc_mem_pool_config[i].absolute_blocks;
+
+ /*
+ * A negative value is "unused" or "invalid". Both *can*
+ * be valid, but centipercent wins, so check that first
+ */
+ if (cp >= 0) { /* centipercent valid */
+ cp_total += cp;
+ } else if (ab >= 0) { /* absolute blocks valid */
+ ab_total += ab;
+ } else { /* neither valid */
+ dd_dev_err(
+ dd,
+ "Send context memory pool %d: both the block count and centipercent are invalid\n",
+ i);
+ return -EINVAL;
+ }
+
+ mem_pool_info[i].centipercent = cp;
+ mem_pool_info[i].blocks = ab;
+ }
+
+ /* do not use both % and absolute blocks for different pools */
+ if (cp_total != 0 && ab_total != 0) {
+ dd_dev_err(
+ dd,
+ "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
+ return -EINVAL;
+ }
+
+ /* if any percentages are present, they must add up to 100% x 100 */
+ if (cp_total != 0 && cp_total != 10000) {
+ dd_dev_err(
+ dd,
+ "Send context memory pool centipercent is %d, expecting 10000\n",
+ cp_total);
+ return -EINVAL;
+ }
+
+ /* the absolute pool total cannot be more than the mem total */
+ if (ab_total > total_blocks) {
+ dd_dev_err(
+ dd,
+ "Send context memory pool absolute block count %d is larger than the memory size %d\n",
+ ab_total, total_blocks);
+ return -EINVAL;
+ }
+
+ /*
+ * Step 2:
+ * - copy from the context size config
+ * - replace context type wildcard counts with real values
+ * - add up non-memory pool block sizes
+ * - add up memory pool user counts
+ */
+ fixed_blocks = 0;
+ for (i = 0; i < SC_MAX; i++) {
+ int count = sc_config_sizes[i].count;
+ int size = sc_config_sizes[i].size;
+ int pool;
+
+ /*
+ * Sanity check count: Either a positive value or
+ * one of the expected wildcards is valid. The positive
+ * value is checked later when we compare against total
+ * memory available.
+ */
+ if (i == SC_ACK) {
+ count = dd->n_krcv_queues;
+ } else if (i == SC_KERNEL) {
+ count = INIT_SC_PER_VL * num_vls;
+ } else if (count == SCC_PER_CPU) {
+ count = dd->num_rcv_contexts - dd->n_krcv_queues;
+ } else if (count < 0) {
+ dd_dev_err(
+ dd,
+ "%s send context invalid count wildcard %d\n",
+ sc_type_name(i), count);
+ return -EINVAL;
+ }
+ if (total_contexts + count > dd->chip_send_contexts)
+ count = dd->chip_send_contexts - total_contexts;
+
+ total_contexts += count;
+
+ /*
+ * Sanity check pool: The conversion will return a pool
+ * number or -1 if a fixed (non-negative) value. The fixed
+ * value is checked later when we compare against
+ * total memory available.
+ */
+ pool = wildcard_to_pool(size);
+ if (pool == -1) { /* non-wildcard */
+ fixed_blocks += size * count;
+ } else if (pool < NUM_SC_POOLS) { /* valid wildcard */
+ mem_pool_info[pool].count += count;
+ } else { /* invalid wildcard */
+ dd_dev_err(
+ dd,
+ "%s send context invalid pool wildcard %d\n",
+ sc_type_name(i), size);
+ return -EINVAL;
+ }
+
+ dd->sc_sizes[i].count = count;
+ dd->sc_sizes[i].size = size;
+ }
+ if (fixed_blocks > total_blocks) {
+ dd_dev_err(
+ dd,
+ "Send context fixed block count, %u, larger than total block count %u\n",
+ fixed_blocks, total_blocks);
+ return -EINVAL;
+ }
+
+ /* step 3: calculate the blocks in the pools, and pool context sizes */
+ pool_blocks = total_blocks - fixed_blocks;
+ if (ab_total > pool_blocks) {
+ dd_dev_err(
+ dd,
+ "Send context fixed pool sizes, %u, larger than pool block count %u\n",
+ ab_total, pool_blocks);
+ return -EINVAL;
+ }
+ /* subtract off the fixed pool blocks */
+ pool_blocks -= ab_total;
+
+ for (i = 0; i < NUM_SC_POOLS; i++) {
+ struct mem_pool_info *pi = &mem_pool_info[i];
+
+ /* % beats absolute blocks */
+ if (pi->centipercent >= 0)
+ pi->blocks = (pool_blocks * pi->centipercent) / 10000;
+
+ if (pi->blocks == 0 && pi->count != 0) {
+ dd_dev_err(
+ dd,
+ "Send context memory pool %d has %u contexts, but no blocks\n",
+ i, pi->count);
+ return -EINVAL;
+ }
+ if (pi->count == 0) {
+ /* warn about wasted blocks */
+ if (pi->blocks != 0)
+ dd_dev_err(
+ dd,
+ "Send context memory pool %d has %u blocks, but zero contexts\n",
+ i, pi->blocks);
+ pi->size = 0;
+ } else {
+ pi->size = pi->blocks / pi->count;
+ }
+ }
+
+ /* step 4: fill in the context type sizes from the pool sizes */
+ used_blocks = 0;
+ for (i = 0; i < SC_MAX; i++) {
+ if (dd->sc_sizes[i].size < 0) {
+ unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
+
+ WARN_ON_ONCE(pool >= NUM_SC_POOLS);
+ dd->sc_sizes[i].size = mem_pool_info[pool].size;
+ }
+ /* make sure we are not larger than what is allowed by the HW */
+#define PIO_MAX_BLOCKS 1024
+ if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
+ dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
+
+ /* calculate our total usage */
+ used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
+ }
+ extra = total_blocks - used_blocks;
+ if (extra != 0)
+ dd_dev_info(dd, "unused send context blocks: %d\n", extra);
+
+ return total_contexts;
+}
+
+int init_send_contexts(struct hfi1_devdata *dd)
+{
+ u16 base;
+ int ret, i, j, context;
+
+ ret = init_credit_return(dd);
+ if (ret)
+ return ret;
+
+ dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
+ GFP_KERNEL);
+ dd->send_contexts = kcalloc(dd->num_send_contexts,
+ sizeof(struct send_context_info),
+ GFP_KERNEL);
+ if (!dd->send_contexts || !dd->hw_to_sw) {
+ kfree(dd->hw_to_sw);
+ kfree(dd->send_contexts);
+ free_credit_return(dd);
+ return -ENOMEM;
+ }
+
+ /* hardware context map starts with invalid send context indices */
+ for (i = 0; i < TXE_NUM_CONTEXTS; i++)
+ dd->hw_to_sw[i] = INVALID_SCI;
+
+ /*
+ * All send contexts have their credit sizes. Allocate credits
+ * for each context one after another from the global space.
+ */
+ context = 0;
+ base = 1;
+ for (i = 0; i < SC_MAX; i++) {
+ struct sc_config_sizes *scs = &dd->sc_sizes[i];
+
+ for (j = 0; j < scs->count; j++) {
+ struct send_context_info *sci =
+ &dd->send_contexts[context];
+ sci->type = i;
+ sci->base = base;
+ sci->credits = scs->size;
+
+ context++;
+ base += scs->size;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a software index and hardware context of the given type.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
+ u32 *hw_context)
+{
+ struct send_context_info *sci;
+ u32 index;
+ u32 context;
+
+ for (index = 0, sci = &dd->send_contexts[0];
+ index < dd->num_send_contexts; index++, sci++) {
+ if (sci->type == type && sci->allocated == 0) {
+ sci->allocated = 1;
+ /* use a 1:1 mapping, but make them non-equal */
+ context = dd->chip_send_contexts - index - 1;
+ dd->hw_to_sw[context] = index;
+ *sw_index = index;
+ *hw_context = context;
+ return 0; /* success */
+ }
+ }
+ dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
+ return -ENOSPC;
+}
+
+/*
+ * Free the send context given by its software index.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
+{
+ struct send_context_info *sci;
+
+ sci = &dd->send_contexts[sw_index];
+ if (!sci->allocated) {
+ dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
+ __func__, sw_index, hw_context);
+ }
+ sci->allocated = 0;
+ dd->hw_to_sw[hw_context] = INVALID_SCI;
+}
+
+/* return the base context of a context in a group */
+static inline u32 group_context(u32 context, u32 group)
+{
+ return (context >> group) << group;
+}
+
+/* return the size of a group */
+static inline u32 group_size(u32 group)
+{
+ return 1 << group;
+}
+
+/*
+ * Obtain the credit return addresses, kernel virtual and physical, for the
+ * given sc.
+ *
+ * To understand this routine:
+ * o va and pa are arrays of struct credit_return. One for each physical
+ * send context, per NUMA.
+ * o Each send context always looks in its relative location in a struct
+ * credit_return for its credit return.
+ * o Each send context in a group must have its return address CSR programmed
+ * with the same value. Use the address of the first send context in the
+ * group.
+ */
+static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
+{
+ u32 gc = group_context(sc->hw_context, sc->group);
+ u32 index = sc->hw_context & 0x7;
+
+ sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
+ *pa = (unsigned long)
+ &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
+}
+
+/*
+ * Work queue function triggered in error interrupt routine for
+ * kernel contexts.
+ */
+static void sc_halted(struct work_struct *work)
+{
+ struct send_context *sc;
+
+ sc = container_of(work, struct send_context, halt_work);
+ sc_restart(sc);
+}
+
+/*
+ * Calculate PIO block threshold for this send context using the given MTU.
+ * Trigger a return when one MTU plus optional header of credits remain.
+ *
+ * Parameter mtu is in bytes.
+ * Parameter hdrqentsize is in DWORDs.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
+{
+ u32 release_credits;
+ u32 threshold;
+
+ /* add in the header size, then divide by the PIO block size */
+ mtu += hdrqentsize << 2;
+ release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
+
+ /* check against this context's credits */
+ if (sc->credits <= release_credits)
+ threshold = 1;
+ else
+ threshold = sc->credits - release_credits;
+
+ return threshold;
+}
+
+/*
+ * Calculate credit threshold in terms of percent of the allocated credits.
+ * Trigger when unreturned credits equal or exceed the percentage of the whole.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+{
+ return (sc->credits * percent) / 100;
+}
+
+/*
+ * Set the credit return threshold.
+ */
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
+{
+ unsigned long flags;
+ u32 old_threshold;
+ int force_return = 0;
+
+ spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+
+ old_threshold = (sc->credit_ctrl >>
+ SC(CREDIT_CTRL_THRESHOLD_SHIFT))
+ & SC(CREDIT_CTRL_THRESHOLD_MASK);
+
+ if (new_threshold != old_threshold) {
+ sc->credit_ctrl =
+ (sc->credit_ctrl
+ & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
+ | ((new_threshold
+ & SC(CREDIT_CTRL_THRESHOLD_MASK))
+ << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
+ write_kctxt_csr(sc->dd, sc->hw_context,
+ SC(CREDIT_CTRL), sc->credit_ctrl);
+
+ /* force a credit return on change to avoid a possible stall */
+ force_return = 1;
+ }
+
+ spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+
+ if (force_return)
+ sc_return_credits(sc);
+}
+
+/*
+ * set_pio_integrity
+ *
+ * Set the CHECK_ENABLE register for the send context 'sc'.
+ */
+void set_pio_integrity(struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg = 0;
+ u32 hw_context = sc->hw_context;
+ int type = sc->type;
+
+ /*
+ * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
+ * we're snooping.
+ */
+ if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+ dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
+ reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
+
+ write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
+}
+
+static u32 get_buffers_allocated(struct send_context *sc)
+{
+ int cpu;
+ u32 ret = 0;
+
+ for_each_possible_cpu(cpu)
+ ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
+ return ret;
+}
+
+static void reset_buffers_allocated(struct send_context *sc)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0;
+}
+
+/*
+ * Allocate a NUMA relative send context structure of the given type along
+ * with a HW context.
+ */
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+ uint hdrqentsize, int numa)
+{
+ struct send_context_info *sci;
+ struct send_context *sc = NULL;
+ dma_addr_t pa;
+ unsigned long flags;
+ u64 reg;
+ u32 thresh;
+ u32 sw_index;
+ u32 hw_context;
+ int ret;
+ u8 opval, opmask;
+
+ /* do not allocate while frozen */
+ if (dd->flags & HFI1_FROZEN)
+ return NULL;
+
+ sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa);
+ if (!sc)
+ return NULL;
+
+ sc->buffers_allocated = alloc_percpu(u32);
+ if (!sc->buffers_allocated) {
+ kfree(sc);
+ dd_dev_err(dd,
+ "Cannot allocate buffers_allocated per cpu counters\n"
+ );
+ return NULL;
+ }
+
+ spin_lock_irqsave(&dd->sc_lock, flags);
+ ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
+ if (ret) {
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+ free_percpu(sc->buffers_allocated);
+ kfree(sc);
+ return NULL;
+ }
+
+ sci = &dd->send_contexts[sw_index];
+ sci->sc = sc;
+
+ sc->dd = dd;
+ sc->node = numa;
+ sc->type = type;
+ spin_lock_init(&sc->alloc_lock);
+ spin_lock_init(&sc->release_lock);
+ spin_lock_init(&sc->credit_ctrl_lock);
+ INIT_LIST_HEAD(&sc->piowait);
+ INIT_WORK(&sc->halt_work, sc_halted);
+ init_waitqueue_head(&sc->halt_wait);
+
+ /* grouping is always single context for now */
+ sc->group = 0;
+
+ sc->sw_index = sw_index;
+ sc->hw_context = hw_context;
+ cr_group_addresses(sc, &pa);
+ sc->credits = sci->credits;
+
+/* PIO Send Memory Address details */
+#define PIO_ADDR_CONTEXT_MASK 0xfful
+#define PIO_ADDR_CONTEXT_SHIFT 16
+ sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
+ << PIO_ADDR_CONTEXT_SHIFT);
+
+ /* set base and credits */
+ reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
+ << SC(CTRL_CTXT_DEPTH_SHIFT))
+ | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
+ << SC(CTRL_CTXT_BASE_SHIFT));
+ write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
+
+ set_pio_integrity(sc);
+
+ /* unmask all errors */
+ write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
+
+ /* set the default partition key */
+ write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
+ (SC(CHECK_PARTITION_KEY_VALUE_MASK) &
+ DEFAULT_PKEY) <<
+ SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
+
+ /* per context type checks */
+ if (type == SC_USER) {
+ opval = USER_OPCODE_CHECK_VAL;
+ opmask = USER_OPCODE_CHECK_MASK;
+ } else {
+ opval = OPCODE_CHECK_VAL_DISABLED;
+ opmask = OPCODE_CHECK_MASK_DISABLED;
+ }
+
+ /* set the send context check opcode mask and value */
+ write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
+ ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
+ ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
+
+ /* set up credit return */
+ reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
+
+ /*
+ * Calculate the initial credit return threshold.
+ *
+ * For Ack contexts, set a threshold for half the credits.
+ * For User contexts use the given percentage. This has been
+ * sanitized on driver start-up.
+ * For Kernel contexts, use the default MTU plus a header
+ * or half the credits, whichever is smaller. This should
+ * work for both the 3-deep buffering allocation and the
+ * pooling allocation.
+ */
+ if (type == SC_ACK) {
+ thresh = sc_percent_to_threshold(sc, 50);
+ } else if (type == SC_USER) {
+ thresh = sc_percent_to_threshold(sc,
+ user_credit_return_threshold);
+ } else { /* kernel */
+ thresh = min(sc_percent_to_threshold(sc, 50),
+ sc_mtu_to_threshold(sc, hfi1_max_mtu,
+ hdrqentsize));
+ }
+ reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
+ /* add in early return */
+ if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
+ reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+ else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
+ reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+
+ /* set up write-through credit_ctrl */
+ sc->credit_ctrl = reg;
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
+
+ /* User send contexts should not allow sending on VL15 */
+ if (type == SC_USER) {
+ reg = 1ULL << 15;
+ write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
+ }
+
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+ /*
+ * Allocate shadow ring to track outstanding PIO buffers _after_
+ * unlocking. We don't know the size until the lock is held and
+ * we can't allocate while the lock is held. No one is using
+ * the context yet, so allocate it now.
+ *
+ * User contexts do not get a shadow ring.
+ */
+ if (type != SC_USER) {
+ /*
+ * Size the shadow ring 1 larger than the number of credits
+ * so head == tail can mean empty.
+ */
+ sc->sr_size = sci->credits + 1;
+ sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
+ sc->sr_size, GFP_KERNEL, numa);
+ if (!sc->sr) {
+ sc_free(sc);
+ return NULL;
+ }
+ }
+
+ hfi1_cdbg(PIO,
+ "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
+ sw_index,
+ hw_context,
+ sc_type_name(type),
+ sc->group,
+ sc->credits,
+ sc->credit_ctrl,
+ thresh);
+
+ return sc;
+}
+
+/* free a per-NUMA send context structure */
+void sc_free(struct send_context *sc)
+{
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+ u32 sw_index;
+ u32 hw_context;
+
+ if (!sc)
+ return;
+
+ sc->flags |= SCF_IN_FREE; /* ensure no restarts */
+ dd = sc->dd;
+ if (!list_empty(&sc->piowait))
+ dd_dev_err(dd, "piowait list not empty!\n");
+ sw_index = sc->sw_index;
+ hw_context = sc->hw_context;
+ sc_disable(sc); /* make sure the HW is disabled */
+ flush_work(&sc->halt_work);
+
+ spin_lock_irqsave(&dd->sc_lock, flags);
+ dd->send_contexts[sw_index].sc = NULL;
+
+ /* clear/disable all registers set in sc_alloc */
+ write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
+ write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
+ write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
+ write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
+ write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
+ write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
+
+ /* release the index and context for re-use */
+ sc_hw_free(dd, sw_index, hw_context);
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+ kfree(sc->sr);
+ free_percpu(sc->buffers_allocated);
+ kfree(sc);
+}
+
+/* disable the context */
+void sc_disable(struct send_context *sc)
+{
+ u64 reg;
+ unsigned long flags;
+ struct pio_buf *pbuf;
+
+ if (!sc)
+ return;
+
+ /* do all steps, even if already disabled */
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
+ reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
+ sc->flags &= ~SCF_ENABLED;
+ sc_wait_for_packet_egress(sc, 1);
+ write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+ /*
+ * Flush any waiters. Once the context is disabled,
+ * credit return interrupts are stopped (although there
+ * could be one in-process when the context is disabled).
+ * Wait one microsecond for any lingering interrupts, then
+ * proceed with the flush.
+ */
+ udelay(1);
+ spin_lock_irqsave(&sc->release_lock, flags);
+ if (sc->sr) { /* this context has a shadow ring */
+ while (sc->sr_tail != sc->sr_head) {
+ pbuf = &sc->sr[sc->sr_tail].pbuf;
+ if (pbuf->cb)
+ (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
+ sc->sr_tail++;
+ if (sc->sr_tail >= sc->sr_size)
+ sc->sr_tail = 0;
+ }
+ }
+ spin_unlock_irqrestore(&sc->release_lock, flags);
+}
+
+/* return SendEgressCtxtStatus.PacketOccupancy */
+#define packet_occupancy(r) \
+ (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
+ >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
+
+/* is egress halted on the context? */
+#define egress_halted(r) \
+ ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
+
+/* wait for packet egress, optionally pause for credit return */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg = 0;
+ u64 reg_prev;
+ u32 loop = 0;
+
+ while (1) {
+ reg_prev = reg;
+ reg = read_csr(dd, sc->hw_context * 8 +
+ SEND_EGRESS_CTXT_STATUS);
+ /* done if egress is stopped */
+ if (egress_halted(reg))
+ break;
+ reg = packet_occupancy(reg);
+ if (reg == 0)
+ break;
+ /* counter is reset if occupancy count changes */
+ if (reg != reg_prev)
+ loop = 0;
+ if (loop > 50000) {
+ /* timed out - bounce the link */
+ dd_dev_err(dd,
+ "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n",
+ __func__, sc->sw_index,
+ sc->hw_context, (u32)reg);
+ queue_work(dd->pport->hfi1_wq,
+ &dd->pport->link_bounce_work);
+ break;
+ }
+ loop++;
+ udelay(1);
+ }
+
+ if (pause)
+ /* Add additional delay to ensure chip returns all credits */
+ pause_for_credit_return(dd);
+}
+
+void sc_wait(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ struct send_context *sc = dd->send_contexts[i].sc;
+
+ if (!sc)
+ continue;
+ sc_wait_for_packet_egress(sc, 0);
+ }
+}
+
+/*
+ * Restart a context after it has been halted due to error.
+ *
+ * If the first step fails - wait for the halt to be asserted, return early.
+ * Otherwise complain about timeouts but keep going.
+ *
+ * It is expected that allocations (enabled flag bit) have been shut off
+ * already (only applies to kernel contexts).
+ */
+int sc_restart(struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ u64 reg;
+ u32 loop;
+ int count;
+
+ /* bounce off if not halted, or being free'd */
+ if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
+ return -EINVAL;
+
+ dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
+ sc->hw_context);
+
+ /*
+ * Step 1: Wait for the context to actually halt.
+ *
+ * The error interrupt is asynchronous to actually setting halt
+ * on the context.
+ */
+ loop = 0;
+ while (1) {
+ reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
+ if (reg & SC(STATUS_CTXT_HALTED_SMASK))
+ break;
+ if (loop > 100) {
+ dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
+ __func__, sc->sw_index, sc->hw_context);
+ return -ETIME;
+ }
+ loop++;
+ udelay(1);
+ }
+
+ /*
+ * Step 2: Ensure no users are still trying to write to PIO.
+ *
+ * For kernel contexts, we have already turned off buffer allocation.
+ * Now wait for the buffer count to go to zero.
+ *
+ * For user contexts, the user handling code has cut off write access
+ * to the context's PIO pages before calling this routine and will
+ * restore write access after this routine returns.
+ */
+ if (sc->type != SC_USER) {
+ /* kernel context */
+ loop = 0;
+ while (1) {
+ count = get_buffers_allocated(sc);
+ if (count == 0)
+ break;
+ if (loop > 100) {
+ dd_dev_err(dd,
+ "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
+ __func__, sc->sw_index,
+ sc->hw_context, count);
+ }
+ loop++;
+ udelay(1);
+ }
+ }
+
+ /*
+ * Step 3: Wait for all packets to egress.
+ * This is done while disabling the send context
+ *
+ * Step 4: Disable the context
+ *
+ * This is a superset of the halt. After the disable, the
+ * errors can be cleared.
+ */
+ sc_disable(sc);
+
+ /*
+ * Step 5: Enable the context
+ *
+ * This enable will clear the halted flag and per-send context
+ * error flags.
+ */
+ return sc_enable(sc);
+}
+
+/*
+ * PIO freeze processing. To be called after the TXE block is fully frozen.
+ * Go through all frozen send contexts and disable them. The contexts are
+ * already stopped by the freeze.
+ */
+void pio_freeze(struct hfi1_devdata *dd)
+{
+ struct send_context *sc;
+ int i;
+
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ sc = dd->send_contexts[i].sc;
+ /*
+ * Don't disable unallocated, unfrozen, or user send contexts.
+ * User send contexts will be disabled when the process
+ * calls into the driver to reset its context.
+ */
+ if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+ continue;
+
+ /* only need to disable, the context is already stopped */
+ sc_disable(sc);
+ }
+}
+
+/*
+ * Unfreeze PIO for kernel send contexts. The precondition for calling this
+ * is that all PIO send contexts have been disabled and the SPC freeze has
+ * been cleared. Now perform the last step and re-enable each kernel context.
+ * User (PSM) processing will occur when PSM calls into the kernel to
+ * acknowledge the freeze.
+ */
+void pio_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+ struct send_context *sc;
+ int i;
+
+ for (i = 0; i < dd->num_send_contexts; i++) {
+ sc = dd->send_contexts[i].sc;
+ if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+ continue;
+
+ sc_enable(sc); /* will clear the sc frozen flag */
+ }
+}
+
+/*
+ * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
+ * Returns:
+ * -ETIMEDOUT - if we wait too long
+ * -EIO - if there was an error
+ */
+static int pio_init_wait_progress(struct hfi1_devdata *dd)
+{
+ u64 reg;
+ int max, count = 0;
+
+ /* max is the longest possible HW init time / delay */
+ max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
+ while (1) {
+ reg = read_csr(dd, SEND_PIO_INIT_CTXT);
+ if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
+ break;
+ if (count >= max)
+ return -ETIMEDOUT;
+ udelay(5);
+ count++;
+ }
+
+ return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
+}
+
+/*
+ * Reset all of the send contexts to their power-on state. Used
+ * only during manual init - no lock against sc_enable needed.
+ */
+void pio_reset_all(struct hfi1_devdata *dd)
+{
+ int ret;
+
+ /* make sure the init engine is not busy */
+ ret = pio_init_wait_progress(dd);
+ /* ignore any timeout */
+ if (ret == -EIO) {
+ /* clear the error */
+ write_csr(dd, SEND_PIO_ERR_CLEAR,
+ SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
+ }
+
+ /* reset init all */
+ write_csr(dd, SEND_PIO_INIT_CTXT,
+ SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
+ udelay(2);
+ ret = pio_init_wait_progress(dd);
+ if (ret < 0) {
+ dd_dev_err(dd,
+ "PIO send context init %s while initializing all PIO blocks\n",
+ ret == -ETIMEDOUT ? "is stuck" : "had an error");
+ }
+}
+
+/* enable the context */
+int sc_enable(struct send_context *sc)
+{
+ u64 sc_ctrl, reg, pio;
+ struct hfi1_devdata *dd;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!sc)
+ return -EINVAL;
+ dd = sc->dd;
+
+ /*
+ * Obtain the allocator lock to guard against any allocation
+ * attempts (which should not happen prior to context being
+ * enabled). On the release/disable side we don't need to
+ * worry about locking since the releaser will not do anything
+ * if the context accounting values have not changed.
+ */
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+ if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
+ goto unlock; /* already enabled */
+
+ /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
+
+ *sc->hw_free = 0;
+ sc->free = 0;
+ sc->alloc_free = 0;
+ sc->fill = 0;
+ sc->sr_head = 0;
+ sc->sr_tail = 0;
+ sc->flags = 0;
+ /* the alloc lock insures no fast path allocation */
+ reset_buffers_allocated(sc);
+
+ /*
+ * Clear all per-context errors. Some of these will be set when
+ * we are re-enabling after a context halt. Now that the context
+ * is disabled, the halt will not clear until after the PIO init
+ * engine runs below.
+ */
+ reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
+ if (reg)
+ write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg);
+
+ /*
+ * The HW PIO initialization engine can handle only one init
+ * request at a time. Serialize access to each device's engine.
+ */
+ spin_lock(&dd->sc_init_lock);
+ /*
+ * Since access to this code block is serialized and
+ * each access waits for the initialization to complete
+ * before releasing the lock, the PIO initialization engine
+ * should not be in use, so we don't have to wait for the
+ * InProgress bit to go down.
+ */
+ pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
+ SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
+ SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
+ write_csr(dd, SEND_PIO_INIT_CTXT, pio);
+ /*
+ * Wait until the engine is done. Give the chip the required time
+ * so, hopefully, we read the register just once.
+ */
+ udelay(2);
+ ret = pio_init_wait_progress(dd);
+ spin_unlock(&dd->sc_init_lock);
+ if (ret) {
+ dd_dev_err(dd,
+ "sctxt%u(%u): Context not enabled due to init failure %d\n",
+ sc->sw_index, sc->hw_context, ret);
+ goto unlock;
+ }
+
+ /*
+ * All is well. Enable the context.
+ */
+ sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
+ write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
+ /*
+ * Read SendCtxtCtrl to force the write out and prevent a timing
+ * hazard where a PIO write may reach the context before the enable.
+ */
+ read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+ sc->flags |= SCF_ENABLED;
+
+unlock:
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+ return ret;
+}
+
+/* force a credit return on the context */
+void sc_return_credits(struct send_context *sc)
+{
+ if (!sc)
+ return;
+
+ /* a 0->1 transition schedules a credit return */
+ write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
+ SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
+ /*
+ * Ensure that the write is flushed and the credit return is
+ * scheduled. We care more about the 0 -> 1 transition.
+ */
+ read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
+ /* set back to 0 for next time */
+ write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
+}
+
+/* allow all in-flight packets to drain on the context */
+void sc_flush(struct send_context *sc)
+{
+ if (!sc)
+ return;
+
+ sc_wait_for_packet_egress(sc, 1);
+}
+
+/* drop all packets on the context, no waiting until they are sent */
+void sc_drop(struct send_context *sc)
+{
+ if (!sc)
+ return;
+
+ dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
+ __func__, sc->sw_index, sc->hw_context);
+}
+
+/*
+ * Start the software reaction to a context halt or SPC freeze:
+ * - mark the context as halted or frozen
+ * - stop buffer allocations
+ *
+ * Called from the error interrupt. Other work is deferred until
+ * out of the interrupt.
+ */
+void sc_stop(struct send_context *sc, int flag)
+{
+ unsigned long flags;
+
+ /* mark the context */
+ sc->flags |= flag;
+
+ /* stop buffer allocations */
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ sc->flags &= ~SCF_ENABLED;
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ wake_up(&sc->halt_wait);
+}
+
+#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32))
+#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
+
+/*
+ * The send context buffer "allocator".
+ *
+ * @sc: the PIO send context we are allocating from
+ * @len: length of whole packet - including PBC - in dwords
+ * @cb: optional callback to call when the buffer is finished sending
+ * @arg: argument for cb
+ *
+ * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ */
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+ pio_release_cb cb, void *arg)
+{
+ struct pio_buf *pbuf = NULL;
+ unsigned long flags;
+ unsigned long avail;
+ unsigned long blocks = dwords_to_blocks(dw_len);
+ unsigned long start_fill;
+ int trycount = 0;
+ u32 head, next;
+
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ if (!(sc->flags & SCF_ENABLED)) {
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ goto done;
+ }
+
+retry:
+ avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
+ if (blocks > avail) {
+ /* not enough room */
+ if (unlikely(trycount)) { /* already tried to get more room */
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ goto done;
+ }
+ /* copy from receiver cache line and recalculate */
+ sc->alloc_free = ACCESS_ONCE(sc->free);
+ avail =
+ (unsigned long)sc->credits -
+ (sc->fill - sc->alloc_free);
+ if (blocks > avail) {
+ /* still no room, actively update */
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+ sc_release_update(sc);
+ spin_lock_irqsave(&sc->alloc_lock, flags);
+ sc->alloc_free = ACCESS_ONCE(sc->free);
+ trycount++;
+ goto retry;
+ }
+ }
+
+ /* there is enough room */
+
+ preempt_disable();
+ this_cpu_inc(*sc->buffers_allocated);
+
+ /* read this once */
+ head = sc->sr_head;
+
+ /* "allocate" the buffer */
+ start_fill = sc->fill;
+ sc->fill += blocks;
+
+ /*
+ * Fill the parts that the releaser looks at before moving the head.
+ * The only necessary piece is the sent_at field. The credits
+ * we have just allocated cannot have been returned yet, so the
+ * cb and arg will not be looked at for a "while". Put them
+ * on this side of the memory barrier anyway.
+ */
+ pbuf = &sc->sr[head].pbuf;
+ pbuf->sent_at = sc->fill;
+ pbuf->cb = cb;
+ pbuf->arg = arg;
+ pbuf->sc = sc; /* could be filled in at sc->sr init time */
+ /* make sure this is in memory before updating the head */
+
+ /* calculate next head index, do not store */
+ next = head + 1;
+ if (next >= sc->sr_size)
+ next = 0;
+ /*
+ * update the head - must be last! - the releaser can look at fields
+ * in pbuf once we move the head
+ */
+ smp_wmb();
+ sc->sr_head = next;
+ spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+ /* finish filling in the buffer outside the lock */
+ pbuf->start = sc->base_addr + ((start_fill % sc->credits)
+ * PIO_BLOCK_SIZE);
+ pbuf->size = sc->credits * PIO_BLOCK_SIZE;
+ pbuf->end = sc->base_addr + pbuf->size;
+ pbuf->block_count = blocks;
+ pbuf->qw_written = 0;
+ pbuf->carry_bytes = 0;
+ pbuf->carry.val64 = 0;
+done:
+ return pbuf;
+}
+
+/*
+ * There are at least two entities that can turn on credit return
+ * interrupts and they can overlap. Avoid problems by implementing
+ * a count scheme that is enforced by a lock. The lock is needed because
+ * the count and CSR write must be paired.
+ */
+
+/*
+ * Start credit return interrupts. This is managed by a count. If already
+ * on, just increment the count.
+ */
+void sc_add_credit_return_intr(struct send_context *sc)
+{
+ unsigned long flags;
+
+ /* lock must surround both the count change and the CSR update */
+ spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+ if (sc->credit_intr_count == 0) {
+ sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+ write_kctxt_csr(sc->dd, sc->hw_context,
+ SC(CREDIT_CTRL), sc->credit_ctrl);
+ }
+ sc->credit_intr_count++;
+ spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * Stop credit return interrupts. This is managed by a count. Decrement the
+ * count, if the last user, then turn the credit interrupts off.
+ */
+void sc_del_credit_return_intr(struct send_context *sc)
+{
+ unsigned long flags;
+
+ WARN_ON(sc->credit_intr_count == 0);
+
+ /* lock must surround both the count change and the CSR update */
+ spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+ sc->credit_intr_count--;
+ if (sc->credit_intr_count == 0) {
+ sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+ write_kctxt_csr(sc->dd, sc->hw_context,
+ SC(CREDIT_CTRL), sc->credit_ctrl);
+ }
+ spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * The caller must be careful when calling this. All needint calls
+ * must be paired with !needint.
+ */
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
+{
+ if (needint)
+ sc_add_credit_return_intr(sc);
+ else
+ sc_del_credit_return_intr(sc);
+ trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+ if (needint) {
+ mmiowb();
+ sc_return_credits(sc);
+ }
+}
+
+/**
+ * sc_piobufavail - callback when a PIO buffer is available
+ * @sc: the send context
+ *
+ * This is called from the interrupt handler when a PIO buffer is
+ * available after hfi1_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+static void sc_piobufavail(struct send_context *sc)
+{
+ struct hfi1_devdata *dd = sc->dd;
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ struct list_head *list;
+ struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
+ struct rvt_qp *qp;
+ struct hfi1_qp_priv *priv;
+ unsigned long flags;
+ unsigned i, n = 0;
+
+ if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
+ dd->send_contexts[sc->sw_index].type != SC_VL15)
+ return;
+ list = &sc->piowait;
+ /*
+ * Note: checking that the piowait list is empty and clearing
+ * the buffer available interrupt needs to be atomic or we
+ * could end up with QPs on the wait list with the interrupt
+ * disabled.
+ */
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ while (!list_empty(list)) {
+ struct iowait *wait;
+
+ if (n == ARRAY_SIZE(qps))
+ break;
+ wait = list_first_entry(list, struct iowait, list);
+ qp = iowait_to_qp(wait);
+ priv = qp->priv;
+ list_del_init(&priv->s_iowait.list);
+ /* refcount held until actual wake up */
+ qps[n++] = qp;
+ }
+ /*
+ * If there had been waiters and there are more
+ * insure that we redo the force to avoid a potential hang.
+ */
+ if (n) {
+ hfi1_sc_wantpiobuf_intr(sc, 0);
+ if (!list_empty(list))
+ hfi1_sc_wantpiobuf_intr(sc, 1);
+ }
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+ for (i = 0; i < n; i++)
+ hfi1_qp_wakeup(qps[i],
+ RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN);
+}
+
+/* translate a send credit update to a bit code of reasons */
+static inline int fill_code(u64 hw_free)
+{
+ int code = 0;
+
+ if (hw_free & CR_STATUS_SMASK)
+ code |= PRC_STATUS_ERR;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
+ code |= PRC_PBC;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
+ code |= PRC_THRESHOLD;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
+ code |= PRC_FILL_ERR;
+ if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
+ code |= PRC_SC_DISABLE;
+ return code;
+}
+
+/* use the jiffies compare to get the wrap right */
+#define sent_before(a, b) time_before(a, b) /* a < b */
+
+/*
+ * The send context buffer "releaser".
+ */
+void sc_release_update(struct send_context *sc)
+{
+ struct pio_buf *pbuf;
+ u64 hw_free;
+ u32 head, tail;
+ unsigned long old_free;
+ unsigned long free;
+ unsigned long extra;
+ unsigned long flags;
+ int code;
+
+ if (!sc)
+ return;
+
+ spin_lock_irqsave(&sc->release_lock, flags);
+ /* update free */
+ hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */
+ old_free = sc->free;
+ extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
+ - (old_free & CR_COUNTER_MASK))
+ & CR_COUNTER_MASK;
+ free = old_free + extra;
+ trace_hfi1_piofree(sc, extra);
+
+ /* call sent buffer callbacks */
+ code = -1; /* code not yet set */
+ head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */
+ tail = sc->sr_tail;
+ while (head != tail) {
+ pbuf = &sc->sr[tail].pbuf;
+
+ if (sent_before(free, pbuf->sent_at)) {
+ /* not sent yet */
+ break;
+ }
+ if (pbuf->cb) {
+ if (code < 0) /* fill in code on first user */
+ code = fill_code(hw_free);
+ (*pbuf->cb)(pbuf->arg, code);
+ }
+
+ tail++;
+ if (tail >= sc->sr_size)
+ tail = 0;
+ }
+ sc->sr_tail = tail;
+ /* make sure tail is updated before free */
+ smp_wmb();
+ sc->free = free;
+ spin_unlock_irqrestore(&sc->release_lock, flags);
+ sc_piobufavail(sc);
+}
+
+/*
+ * Send context group releaser. Argument is the send context that caused
+ * the interrupt. Called from the send context interrupt handler.
+ *
+ * Call release on all contexts in the group.
+ *
+ * This routine takes the sc_lock without an irqsave because it is only
+ * called from an interrupt handler. Adjust if that changes.
+ */
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
+{
+ struct send_context *sc;
+ u32 sw_index;
+ u32 gc, gc_end;
+
+ spin_lock(&dd->sc_lock);
+ sw_index = dd->hw_to_sw[hw_context];
+ if (unlikely(sw_index >= dd->num_send_contexts)) {
+ dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
+ __func__, hw_context, sw_index);
+ goto done;
+ }
+ sc = dd->send_contexts[sw_index].sc;
+ if (unlikely(!sc))
+ goto done;
+
+ gc = group_context(hw_context, sc->group);
+ gc_end = gc + group_size(sc->group);
+ for (; gc < gc_end; gc++) {
+ sw_index = dd->hw_to_sw[gc];
+ if (unlikely(sw_index >= dd->num_send_contexts)) {
+ dd_dev_err(dd,
+ "%s: invalid hw (%u) to sw (%u) mapping\n",
+ __func__, hw_context, sw_index);
+ continue;
+ }
+ sc_release_update(dd->send_contexts[sw_index].sc);
+ }
+done:
+ spin_unlock(&dd->sc_lock);
+}
+
+/*
+ * pio_select_send_context_vl() - select send context
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ * This function returns a send context based on the selector and a vl.
+ * The mapping fields are protected by RCU
+ */
+struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
+ u32 selector, u8 vl)
+{
+ struct pio_vl_map *m;
+ struct pio_map_elem *e;
+ struct send_context *rval;
+
+ /*
+ * NOTE This should only happen if SC->VL changed after the initial
+ * checks on the QP/AH
+ * Default will return VL0's send context below
+ */
+ if (unlikely(vl >= num_vls)) {
+ rval = NULL;
+ goto done;
+ }
+
+ rcu_read_lock();
+ m = rcu_dereference(dd->pio_map);
+ if (unlikely(!m)) {
+ rcu_read_unlock();
+ return dd->vld[0].sc;
+ }
+ e = m->map[vl & m->mask];
+ rval = e->ksc[selector & e->mask];
+ rcu_read_unlock();
+
+done:
+ rval = !rval ? dd->vld[0].sc : rval;
+ return rval;
+}
+
+/*
+ * pio_select_send_context_sc() - select send context
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ * This function returns an send context based on the selector and an sc
+ */
+struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
+ u32 selector, u8 sc5)
+{
+ u8 vl = sc_to_vlt(dd, sc5);
+
+ return pio_select_send_context_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void pio_map_free(struct pio_vl_map *m)
+{
+ int i;
+
+ for (i = 0; m && i < m->actual_vls; i++)
+ kfree(m->map[i]);
+ kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void pio_map_rcu_callback(struct rcu_head *list)
+{
+ struct pio_vl_map *m = container_of(list, struct pio_vl_map, list);
+
+ pio_map_free(m);
+}
+
+/*
+ * Set credit return threshold for the kernel send context
+ */
+static void set_threshold(struct hfi1_devdata *dd, int scontext, int i)
+{
+ u32 thres;
+
+ thres = min(sc_percent_to_threshold(dd->kernel_send_context[scontext],
+ 50),
+ sc_mtu_to_threshold(dd->kernel_send_context[scontext],
+ dd->vld[i].mtu,
+ dd->rcd[0]->rcvhdrqentsize));
+ sc_set_cr_threshold(dd->kernel_send_context[scontext], thres);
+}
+
+/*
+ * pio_map_init - called when #vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_scontexts: per vl send context mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_scontexts is used to specify a non-uniform vl/send context
+ * loading. NULL implies auto computing the loading and giving each
+ * VL an uniform distribution of send contexts per VL.
+ *
+ * The auto algorithm computers the sc_per_vl and the number of extra
+ * send contexts. Any extra send contexts are added from the last VL
+ * on down
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_send_contexts are non-power of 2, the
+ * array sizes in the struct pio_vl_map and the struct pio_map_elem are
+ * rounded up to the next highest power of 2 and the first entry is
+ * reused in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is not
+ * chaged.
+ *
+ */
+int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts)
+{
+ int i, j;
+ int extra, sc_per_vl;
+ int scontext = 1;
+ int num_kernel_send_contexts = 0;
+ u8 lvl_scontexts[OPA_MAX_VLS];
+ struct pio_vl_map *oldmap, *newmap;
+
+ if (!vl_scontexts) {
+ for (i = 0; i < dd->num_send_contexts; i++)
+ if (dd->send_contexts[i].type == SC_KERNEL)
+ num_kernel_send_contexts++;
+ /* truncate divide */
+ sc_per_vl = num_kernel_send_contexts / num_vls;
+ /* extras */
+ extra = num_kernel_send_contexts % num_vls;
+ vl_scontexts = lvl_scontexts;
+ /* add extras from last vl down */
+ for (i = num_vls - 1; i >= 0; i--, extra--)
+ vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0);
+ }
+ /* build new map */
+ newmap = kzalloc(sizeof(*newmap) +
+ roundup_pow_of_two(num_vls) *
+ sizeof(struct pio_map_elem *),
+ GFP_KERNEL);
+ if (!newmap)
+ goto bail;
+ newmap->actual_vls = num_vls;
+ newmap->vls = roundup_pow_of_two(num_vls);
+ newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+ for (i = 0; i < newmap->vls; i++) {
+ /* save for wrap around */
+ int first_scontext = scontext;
+
+ if (i < newmap->actual_vls) {
+ int sz = roundup_pow_of_two(vl_scontexts[i]);
+
+ /* only allocate once */
+ newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) +
+ sz * sizeof(struct
+ send_context *),
+ GFP_KERNEL);
+ if (!newmap->map[i])
+ goto bail;
+ newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+ /*
+ * assign send contexts and
+ * adjust credit return threshold
+ */
+ for (j = 0; j < sz; j++) {
+ if (dd->kernel_send_context[scontext]) {
+ newmap->map[i]->ksc[j] =
+ dd->kernel_send_context[scontext];
+ set_threshold(dd, scontext, i);
+ }
+ if (++scontext >= first_scontext +
+ vl_scontexts[i])
+ /* wrap back to first send context */
+ scontext = first_scontext;
+ }
+ } else {
+ /* just re-use entry without allocating */
+ newmap->map[i] = newmap->map[i % num_vls];
+ }
+ scontext = first_scontext + vl_scontexts[i];
+ }
+ /* newmap in hand, save old map */
+ spin_lock_irq(&dd->pio_map_lock);
+ oldmap = rcu_dereference_protected(dd->pio_map,
+ lockdep_is_held(&dd->pio_map_lock));
+
+ /* publish newmap */
+ rcu_assign_pointer(dd->pio_map, newmap);
+
+ spin_unlock_irq(&dd->pio_map_lock);
+ /* success, free any old map after grace period */
+ if (oldmap)
+ call_rcu(&oldmap->list, pio_map_rcu_callback);
+ return 0;
+bail:
+ /* free any partial allocation */
+ pio_map_free(newmap);
+ return -ENOMEM;
+}
+
+void free_pio_map(struct hfi1_devdata *dd)
+{
+ /* Free PIO map if allocated */
+ if (rcu_access_pointer(dd->pio_map)) {
+ spin_lock_irq(&dd->pio_map_lock);
+ pio_map_free(rcu_access_pointer(dd->pio_map));
+ RCU_INIT_POINTER(dd->pio_map, NULL);
+ spin_unlock_irq(&dd->pio_map_lock);
+ synchronize_rcu();
+ }
+ kfree(dd->kernel_send_context);
+ dd->kernel_send_context = NULL;
+}
+
+int init_pervl_scs(struct hfi1_devdata *dd)
+{
+ int i;
+ u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */
+ u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */
+ u32 ctxt;
+ struct hfi1_pportdata *ppd = dd->pport;
+
+ dd->vld[15].sc = sc_alloc(dd, SC_VL15,
+ dd->rcd[0]->rcvhdrqentsize, dd->node);
+ if (!dd->vld[15].sc)
+ return -ENOMEM;
+
+ hfi1_init_ctxt(dd->vld[15].sc);
+ dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
+
+ dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
+ sizeof(struct send_context *),
+ GFP_KERNEL, dd->node);
+ if (!dd->kernel_send_context)
+ goto freesc15;
+
+ dd->kernel_send_context[0] = dd->vld[15].sc;
+
+ for (i = 0; i < num_vls; i++) {
+ /*
+ * Since this function does not deal with a specific
+ * receive context but we need the RcvHdrQ entry size,
+ * use the size from rcd[0]. It is guaranteed to be
+ * valid at this point and will remain the same for all
+ * receive contexts.
+ */
+ dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
+ dd->rcd[0]->rcvhdrqentsize, dd->node);
+ if (!dd->vld[i].sc)
+ goto nomem;
+ dd->kernel_send_context[i + 1] = dd->vld[i].sc;
+ hfi1_init_ctxt(dd->vld[i].sc);
+ /* non VL15 start with the max MTU */
+ dd->vld[i].mtu = hfi1_max_mtu;
+ }
+ for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
+ dd->kernel_send_context[i + 1] =
+ sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node);
+ if (!dd->kernel_send_context[i + 1])
+ goto nomem;
+ hfi1_init_ctxt(dd->kernel_send_context[i + 1]);
+ }
+
+ sc_enable(dd->vld[15].sc);
+ ctxt = dd->vld[15].sc->hw_context;
+ mask = all_vl_mask & ~(1LL << 15);
+ write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+ dd_dev_info(dd,
+ "Using send context %u(%u) for VL15\n",
+ dd->vld[15].sc->sw_index, ctxt);
+
+ for (i = 0; i < num_vls; i++) {
+ sc_enable(dd->vld[i].sc);
+ ctxt = dd->vld[i].sc->hw_context;
+ mask = all_vl_mask & ~(data_vls_mask);
+ write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+ }
+ for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) {
+ sc_enable(dd->kernel_send_context[i + 1]);
+ ctxt = dd->kernel_send_context[i + 1]->hw_context;
+ mask = all_vl_mask & ~(data_vls_mask);
+ write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+ }
+
+ if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
+ goto nomem;
+ return 0;
+
+nomem:
+ for (i = 0; i < num_vls; i++) {
+ sc_free(dd->vld[i].sc);
+ dd->vld[i].sc = NULL;
+ }
+
+ for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
+ sc_free(dd->kernel_send_context[i + 1]);
+
+ kfree(dd->kernel_send_context);
+ dd->kernel_send_context = NULL;
+
+freesc15:
+ sc_free(dd->vld[15].sc);
+ return -ENOMEM;
+}
+
+int init_credit_return(struct hfi1_devdata *dd)
+{
+ int ret;
+ int num_numa;
+ int i;
+
+ num_numa = num_online_nodes();
+ /* enforce the expectation that the numas are compact */
+ for (i = 0; i < num_numa; i++) {
+ if (!node_online(i)) {
+ dd_dev_err(dd, "NUMA nodes are not compact\n");
+ ret = -EINVAL;
+ goto done;
+ }
+ }
+
+ dd->cr_base = kcalloc(
+ num_numa,
+ sizeof(struct credit_return_base),
+ GFP_KERNEL);
+ if (!dd->cr_base) {
+ dd_dev_err(dd, "Unable to allocate credit return base\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+ for (i = 0; i < num_numa; i++) {
+ int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
+
+ set_dev_node(&dd->pcidev->dev, i);
+ dd->cr_base[i].va = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ bytes,
+ &dd->cr_base[i].pa,
+ GFP_KERNEL);
+ if (!dd->cr_base[i].va) {
+ set_dev_node(&dd->pcidev->dev, dd->node);
+ dd_dev_err(dd,
+ "Unable to allocate credit return DMA range for NUMA %d\n",
+ i);
+ ret = -ENOMEM;
+ goto done;
+ }
+ }
+ set_dev_node(&dd->pcidev->dev, dd->node);
+
+ ret = 0;
+done:
+ return ret;
+}
+
+void free_credit_return(struct hfi1_devdata *dd)
+{
+ int num_numa;
+ int i;
+
+ if (!dd->cr_base)
+ return;
+
+ num_numa = num_online_nodes();
+ for (i = 0; i < num_numa; i++) {
+ if (dd->cr_base[i].va) {
+ dma_free_coherent(&dd->pcidev->dev,
+ TXE_NUM_CONTEXTS *
+ sizeof(struct credit_return),
+ dd->cr_base[i].va,
+ dd->cr_base[i].pa);
+ }
+ }
+ kfree(dd->cr_base);
+ dd->cr_base = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
new file mode 100644
index 000000000000..464cbd27b975
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -0,0 +1,328 @@
+#ifndef _PIO_H
+#define _PIO_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* send context types */
+#define SC_KERNEL 0
+#define SC_VL15 1
+#define SC_ACK 2
+#define SC_USER 3 /* must be the last one: it may take all left */
+#define SC_MAX 4 /* count of send context types */
+
+/* invalid send context index */
+#define INVALID_SCI 0xff
+
+/* PIO buffer release callback function */
+typedef void (*pio_release_cb)(void *arg, int code);
+
+/* PIO release codes - in bits, as there could more than one that apply */
+#define PRC_OK 0 /* no known error */
+#define PRC_STATUS_ERR 0x01 /* credit return due to status error */
+#define PRC_PBC 0x02 /* credit return due to PBC */
+#define PRC_THRESHOLD 0x04 /* credit return due to threshold */
+#define PRC_FILL_ERR 0x08 /* credit return due fill error */
+#define PRC_FORCE 0x10 /* credit return due credit force */
+#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */
+
+/* byte helper */
+union mix {
+ u64 val64;
+ u32 val32[2];
+ u8 val8[8];
+};
+
+/* an allocated PIO buffer */
+struct pio_buf {
+ struct send_context *sc;/* back pointer to owning send context */
+ pio_release_cb cb; /* called when the buffer is released */
+ void *arg; /* argument for cb */
+ void __iomem *start; /* buffer start address */
+ void __iomem *end; /* context end address */
+ unsigned long size; /* context size, in bytes */
+ unsigned long sent_at; /* buffer is sent when <= free */
+ u32 block_count; /* size of buffer, in blocks */
+ u32 qw_written; /* QW written so far */
+ u32 carry_bytes; /* number of valid bytes in carry */
+ union mix carry; /* pending unwritten bytes */
+};
+
+/* cache line aligned pio buffer array */
+union pio_shadow_ring {
+ struct pio_buf pbuf;
+ u64 unused[16]; /* cache line spacer */
+} ____cacheline_aligned;
+
+/* per-NUMA send context */
+struct send_context {
+ /* read-only after init */
+ struct hfi1_devdata *dd; /* device */
+ void __iomem *base_addr; /* start of PIO memory */
+ union pio_shadow_ring *sr; /* shadow ring */
+
+ volatile __le64 *hw_free; /* HW free counter */
+ struct work_struct halt_work; /* halted context work queue entry */
+ unsigned long flags; /* flags */
+ int node; /* context home node */
+ int type; /* context type */
+ u32 sw_index; /* software index number */
+ u32 hw_context; /* hardware context number */
+ u32 credits; /* number of blocks in context */
+ u32 sr_size; /* size of the shadow ring */
+ u32 group; /* credit return group */
+ /* allocator fields */
+ spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+ unsigned long fill; /* official alloc count */
+ unsigned long alloc_free; /* copy of free (less cache thrash) */
+ u32 sr_head; /* shadow ring head */
+ /* releaser fields */
+ spinlock_t release_lock ____cacheline_aligned_in_smp;
+ unsigned long free; /* official free count */
+ u32 sr_tail; /* shadow ring tail */
+ /* list for PIO waiters */
+ struct list_head piowait ____cacheline_aligned_in_smp;
+ spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
+ u64 credit_ctrl; /* cache for credit control */
+ u32 credit_intr_count; /* count of credit intr users */
+ u32 __percpu *buffers_allocated;/* count of buffers allocated */
+ wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */
+};
+
+/* send context flags */
+#define SCF_ENABLED 0x01
+#define SCF_IN_FREE 0x02
+#define SCF_HALTED 0x04
+#define SCF_FROZEN 0x08
+
+struct send_context_info {
+ struct send_context *sc; /* allocated working context */
+ u16 allocated; /* has this been allocated? */
+ u16 type; /* context type */
+ u16 base; /* base in PIO array */
+ u16 credits; /* size in PIO array */
+};
+
+/* DMA credit return, index is always (context & 0x7) */
+struct credit_return {
+ volatile __le64 cr[8];
+};
+
+/* NUMA indexed credit return array */
+struct credit_return_base {
+ struct credit_return *va;
+ dma_addr_t pa;
+};
+
+/* send context configuration sizes (one per type) */
+struct sc_config_sizes {
+ short int size;
+ short int count;
+};
+
+/*
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform send contexts per vl, the
+ * number of send contexts for a vl is either the vl_scontexts[vl] or
+ * a computation based on num_kernel_send_contexts/num_vls:
+ *
+ * For example:
+ * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where there are num_kernel_send_contexts/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the send contexts are assigned
+ * in a round robin fashion wrapping back to the first send context
+ * for a particular vl.
+ *
+ * dd->pio_map
+ * | pio_map_elem[0]
+ * | +--------------------+
+ * v | mask |
+ * pio_vl_map |--------------------|
+ * +--------------------------+ | ksc[0] -> sc 1 |
+ * | list (RCU) | |--------------------|
+ * |--------------------------| ->| ksc[1] -> sc 2 |
+ * | mask | --/ |--------------------|
+ * |--------------------------| -/ | * |
+ * | actual_vls (max 8) | -/ |--------------------|
+ * |--------------------------| --/ | ksc[n] -> sc n |
+ * | vls (max 8) | -/ +--------------------+
+ * |--------------------------| --/
+ * | map[0] |-/
+ * |--------------------------| +--------------------+
+ * | map[1] |--- | mask |
+ * |--------------------------| \---- |--------------------|
+ * | * | \-- | ksc[0] -> sc 1+n |
+ * | * | \---- |--------------------|
+ * | * | \->| ksc[1] -> sc 2+n |
+ * |--------------------------| |--------------------|
+ * | map[vls - 1] |- | * |
+ * +--------------------------+ \- |--------------------|
+ * \- | ksc[m] -> sc m+n |
+ * \ +--------------------+
+ * \-
+ * \
+ * \- +--------------------+
+ * \- | mask |
+ * \ |--------------------|
+ * \- | ksc[0] -> sc 1+m+n |
+ * \- |--------------------|
+ * >| ksc[1] -> sc 2+m+n |
+ * |--------------------|
+ * | * |
+ * |--------------------|
+ * | ksc[o] -> sc o+m+n |
+ * +--------------------+
+ *
+ */
+
+/* Initial number of send contexts per VL */
+#define INIT_SC_PER_VL 2
+
+/*
+ * struct pio_map_elem - mapping for a vl
+ * @mask - selector mask
+ * @ksc - array of kernel send contexts for this vl
+ *
+ * The mask is used to "mod" the selector to
+ * produce index into the trailing array of
+ * kscs
+ */
+struct pio_map_elem {
+ u32 mask;
+ struct send_context *ksc[0];
+};
+
+/*
+ * struct pio_vl_map - mapping for a vl
+ * @list - rcu head for free callback
+ * @mask - vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls - number of vls
+ * @vls - numbers of vls rounded to next power of 2
+ * @map - array of pio_map_elem entries
+ *
+ * This is the parent mapping structure. The trailing members of the
+ * struct point to pio_map_elem entries, which in turn point to an
+ * array of kscs for that vl.
+ */
+struct pio_vl_map {
+ struct rcu_head list;
+ u32 mask;
+ u8 actual_vls;
+ u8 vls;
+ struct pio_map_elem *map[0];
+};
+
+int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
+ u8 *vl_scontexts);
+void free_pio_map(struct hfi1_devdata *dd);
+struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd,
+ u32 selector, u8 vl);
+struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd,
+ u32 selector, u8 sc5);
+
+/* send context functions */
+int init_credit_return(struct hfi1_devdata *dd);
+void free_credit_return(struct hfi1_devdata *dd);
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
+int init_send_contexts(struct hfi1_devdata *dd);
+int init_credit_return(struct hfi1_devdata *dd);
+int init_pervl_scs(struct hfi1_devdata *dd);
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+ uint hdrqentsize, int numa);
+void sc_free(struct send_context *sc);
+int sc_enable(struct send_context *sc);
+void sc_disable(struct send_context *sc);
+int sc_restart(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_flush(struct send_context *sc);
+void sc_drop(struct send_context *sc);
+void sc_stop(struct send_context *sc, int bit);
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+ pio_release_cb cb, void *arg);
+void sc_release_update(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
+void sc_add_credit_return_intr(struct send_context *sc);
+void sc_del_credit_return_intr(struct send_context *sc);
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_percent_to_threshold(struct send_context *sc, u32 percent);
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
+void sc_wait(struct hfi1_devdata *dd);
+void set_pio_integrity(struct send_context *sc);
+
+/* support functions */
+void pio_reset_all(struct hfi1_devdata *dd);
+void pio_freeze(struct hfi1_devdata *dd);
+void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+
+/* global PIO send control operations */
+#define PSC_GLOBAL_ENABLE 0
+#define PSC_GLOBAL_DISABLE 1
+#define PSC_GLOBAL_VLARB_ENABLE 2
+#define PSC_GLOBAL_VLARB_DISABLE 3
+#define PSC_CM_RESET 4
+#define PSC_DATA_VL_ENABLE 5
+#define PSC_DATA_VL_DISABLE 6
+
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
+void pio_send_control(struct hfi1_devdata *dd, int op);
+
+/* PIO copy routines */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t count);
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t nbytes);
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
+void seg_pio_copy_end(struct pio_buf *pbuf);
+
+#endif /* _PIO_H */
diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
new file mode 100644
index 000000000000..3a1ef3056282
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/pio_copy.c
@@ -0,0 +1,879 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/* additive distance between non-SOP and SOP space */
+#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
+#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1)
+/* number of QUADWORDs in a block */
+#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64))
+
+/**
+ * pio_copy - copy data block to MMIO space
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @pbc: PBC to send
+ * @from: source, must be 8 byte aligned
+ * @count: number of DWORD (32-bit) quantities to copy from source
+ *
+ * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
+ * Must always write full BLOCK_SIZE bytes blocks. The first block must
+ * be written to the corresponding SOP=1 address.
+ *
+ * Known:
+ * o pbuf->start always starts on a block boundary
+ * o pbuf can wrap only at a block boundary
+ */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t count)
+{
+ void __iomem *dest = pbuf->start + SOP_DISTANCE;
+ void __iomem *send = dest + PIO_BLOCK_SIZE;
+ void __iomem *dend; /* 8-byte data end */
+
+ /* write the PBC */
+ writeq(pbc, dest);
+ dest += sizeof(u64);
+
+ /* calculate where the QWORD data ends - in SOP=1 space */
+ dend = dest + ((count >> 1) * sizeof(u64));
+
+ if (dend < send) {
+ /*
+ * all QWORD data is within the SOP block, does *not*
+ * reach the end of the SOP block
+ */
+
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /*
+ * No boundary checks are needed here:
+ * 0. We're not on the SOP block boundary
+ * 1. The possible DWORD dangle will still be within
+ * the SOP block
+ * 2. We cannot wrap except on a block boundary.
+ */
+ } else {
+ /* QWORD data extends _to_ or beyond the SOP block */
+
+ /* write 8-byte SOP chunk data */
+ while (dest < send) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /* drop out of the SOP range */
+ dest -= SOP_DISTANCE;
+ dend -= SOP_DISTANCE;
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written, but we will wrap in
+ * case there is a dangling DWORD.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ }
+ /* at this point we have wrapped if we are going to wrap */
+
+ /* write dangling u32, if any */
+ if (count & 1) {
+ union mix val;
+
+ val.val64 = 0;
+ val.val32[0] = *(u32 *)from;
+ writeq(val.val64, dest);
+ dest += sizeof(u64);
+ }
+ /*
+ * fill in rest of block, no need to check pbuf->end
+ * as we only wrap on a block boundary
+ */
+ while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+ writeq(0, dest);
+ dest += sizeof(u64);
+ }
+
+ /* finished with this buffer */
+ this_cpu_dec(*pbuf->sc->buffers_allocated);
+ preempt_enable();
+}
+
+/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
+#define USE_SHIFTS 1
+#ifdef USE_SHIFTS
+/*
+ * Handle carry bytes using shifts and masks.
+ *
+ * NOTE: the value the unused portion of carry is expected to always be zero.
+ */
+
+/*
+ * "zero" shift - bit shift used to zero out upper bytes. Input is
+ * the count of LSB bytes to preserve.
+ */
+#define zshift(x) (8 * (8 - (x)))
+
+/*
+ * "merge" shift - bit shift used to merge with carry bytes. Input is
+ * the LSB byte count to move beyond.
+ */
+#define mshift(x) (8 * (x))
+
+/*
+ * Read nbytes bytes from "from" and return them in the LSB bytes
+ * of pbuf->carry. Other bytes are zeroed. Any previous value
+ * pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned
+ * o nbytes must not span a QW boundary
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+ unsigned int nbytes)
+{
+ unsigned long off;
+
+ if (nbytes == 0) {
+ pbuf->carry.val64 = 0;
+ } else {
+ /* align our pointer */
+ off = (unsigned long)from & 0x7;
+ from = (void *)((unsigned long)from & ~0x7l);
+ pbuf->carry.val64 = ((*(u64 *)from)
+ << zshift(nbytes + off))/* zero upper bytes */
+ >> zshift(nbytes); /* place at bottom */
+ }
+ pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the next significant bytes
+ * of pbuf->carry. Unused bytes are zeroed. It is expected that the extra
+ * read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+ const void *from, unsigned int nbytes)
+{
+ unsigned long off = (unsigned long)from & 0x7;
+ unsigned int room, xbytes;
+
+ /* align our pointer */
+ from = (void *)((unsigned long)from & ~0x7l);
+
+ /* check count first - don't read anything if count is zero */
+ while (nbytes) {
+ /* find the number of bytes in this u64 */
+ room = 8 - off; /* this u64 has room for this many bytes */
+ xbytes = min(room, nbytes);
+
+ /*
+ * shift down to zero lower bytes, shift up to zero upper
+ * bytes, shift back down to move into place
+ */
+ pbuf->carry.val64 |= (((*(u64 *)from)
+ >> mshift(off))
+ << zshift(xbytes))
+ >> zshift(xbytes + pbuf->carry_bytes);
+ off = 0;
+ pbuf->carry_bytes += xbytes;
+ nbytes -= xbytes;
+ from += sizeof(u64);
+ }
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+ unsigned int remaining;
+
+ if (zbytes == 0) /* nothing to do */
+ return;
+
+ remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
+
+ /* NOTE: zshift only guaranteed to work if remaining != 0 */
+ if (remaining)
+ pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
+ >> zshift(remaining);
+ else
+ pbuf->carry.val64 = 0;
+ pbuf->carry_bytes = remaining;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of
+ * pbuf->carry with the upper bytes zeroed..
+ *
+ * NOTES:
+ * o result must keep unused bytes zeroed
+ * o src must be u64 aligned
+ */
+static inline void merge_write8(
+ struct pio_buf *pbuf,
+ void __iomem *dest,
+ const void *src)
+{
+ u64 new, temp;
+
+ new = *(u64 *)src;
+ temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
+ writeq(temp, dest);
+ pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+ writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry. If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+ if (pbuf->carry_bytes) {
+ /* unused bytes are always kept zeroed, so just write */
+ writeq(pbuf->carry.val64, dest);
+ return 1;
+ }
+
+ return 0;
+}
+
+#else /* USE_SHIFTS */
+/*
+ * Handle carry bytes using byte copies.
+ *
+ * NOTE: the value the unused portion of carry is left uninitialized.
+ */
+
+/*
+ * Jump copy - no-loop copy for < 8 bytes.
+ */
+static inline void jcopy(u8 *dest, const u8 *src, u32 n)
+{
+ switch (n) {
+ case 7:
+ *dest++ = *src++;
+ case 6:
+ *dest++ = *src++;
+ case 5:
+ *dest++ = *src++;
+ case 4:
+ *dest++ = *src++;
+ case 3:
+ *dest++ = *src++;
+ case 2:
+ *dest++ = *src++;
+ case 1:
+ *dest++ = *src++;
+ }
+}
+
+/*
+ * Read nbytes from "from" and and place them in the low bytes
+ * of pbuf->carry. Other bytes are left as-is. Any previous
+ * value in pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from from if nbytes is zero
+ * o from may _not_ be u64 aligned.
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+ unsigned int nbytes)
+{
+ jcopy(&pbuf->carry.val8[0], from, nbytes);
+ pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
+ * It is expected that the extra read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+ const void *from, unsigned int nbytes)
+{
+ jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
+ pbuf->carry_bytes += nbytes;
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * We do not care about the value of unused bytes in carry, so just
+ * reduce the byte count.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+ pbuf->carry_bytes -= zbytes;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the low bytes of
+ * pbuf->carry.
+ */
+static inline void merge_write8(
+ struct pio_buf *pbuf,
+ void *dest,
+ const void *src)
+{
+ u32 remainder = 8 - pbuf->carry_bytes;
+
+ jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
+ writeq(pbuf->carry.val64, dest);
+ jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void *dest)
+{
+ writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry. If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void *dest)
+{
+ if (pbuf->carry_bytes) {
+ u64 zero = 0;
+
+ jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
+ 8 - pbuf->carry_bytes);
+ writeq(pbuf->carry.val64, dest);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* USE_SHIFTS */
+
+/*
+ * Segmented PIO Copy - start
+ *
+ * Start a PIO copy.
+ *
+ * @pbuf: destination buffer
+ * @pbc: the PBC for the PIO buffer
+ * @from: data source, QWORD aligned
+ * @nbytes: bytes to copy
+ */
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+ const void *from, size_t nbytes)
+{
+ void __iomem *dest = pbuf->start + SOP_DISTANCE;
+ void __iomem *send = dest + PIO_BLOCK_SIZE;
+ void __iomem *dend; /* 8-byte data end */
+
+ writeq(pbc, dest);
+ dest += sizeof(u64);
+
+ /* calculate where the QWORD data ends - in SOP=1 space */
+ dend = dest + ((nbytes >> 3) * sizeof(u64));
+
+ if (dend < send) {
+ /*
+ * all QWORD data is within the SOP block, does *not*
+ * reach the end of the SOP block
+ */
+
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /*
+ * No boundary checks are needed here:
+ * 0. We're not on the SOP block boundary
+ * 1. The possible DWORD dangle will still be within
+ * the SOP block
+ * 2. We cannot wrap except on a block boundary.
+ */
+ } else {
+ /* QWORD data extends _to_ or beyond the SOP block */
+
+ /* write 8-byte SOP chunk data */
+ while (dest < send) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ /* drop out of the SOP range */
+ dest -= SOP_DISTANCE;
+ dend -= SOP_DISTANCE;
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written, but we will wrap in
+ * case there is a dangling DWORD.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+ }
+ /* at this point we have wrapped if we are going to wrap */
+
+ /* ...but it doesn't matter as we're done writing */
+
+ /* save dangling bytes, if any */
+ read_low_bytes(pbuf, from, nbytes & 0x7);
+
+ pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
+}
+
+/*
+ * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
+ * bytes are non-zero.
+ *
+ * Whole u64s must be written to the chip, so bytes must be manually merged.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned.
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+ void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+ void __iomem *dend; /* 8-byte data end */
+ unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
+ unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
+
+ /* calculate 8-byte data end */
+ dend = dest + (qw_to_write * sizeof(u64));
+
+ if (pbuf->qw_written < PIO_BLOCK_QWS) {
+ /*
+ * Still within SOP block. We don't need to check for
+ * wrap because we are still in the first block and
+ * can only wrap on block boundaries.
+ */
+ void __iomem *send; /* SOP end */
+ void __iomem *xend;
+
+ /*
+ * calculate the end of data or end of block, whichever
+ * comes first
+ */
+ send = pbuf->start + PIO_BLOCK_SIZE;
+ xend = min(send, dend);
+
+ /* shift up to SOP=1 space */
+ dest += SOP_DISTANCE;
+ xend += SOP_DISTANCE;
+
+ /* write 8-byte chunk data */
+ while (dest < xend) {
+ merge_write8(pbuf, dest, from);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* shift down to SOP=0 space */
+ dest -= SOP_DISTANCE;
+ }
+ /*
+ * At this point dest could be (either, both, or neither):
+ * - at dend
+ * - at the wrap
+ */
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If dest is at the wrap, we will fall into the if,
+ * not do the loop, when wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ merge_write8(pbuf, dest, from);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ merge_write8(pbuf, dest, from);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* adjust carry */
+ if (pbuf->carry_bytes < bytes_left) {
+ /* need to read more */
+ read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
+ } else {
+ /* remove invalid bytes */
+ zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
+ }
+
+ pbuf->qw_written += qw_to_write;
+}
+
+/*
+ * Mid copy helper, "straight case" - source pointer is 64-bit aligned
+ * with no carry bytes.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_straight(struct pio_buf *pbuf,
+ const void *from, size_t nbytes)
+{
+ void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+ void __iomem *dend; /* 8-byte data end */
+
+ /* calculate 8-byte data end */
+ dend = dest + ((nbytes >> 3) * sizeof(u64));
+
+ if (pbuf->qw_written < PIO_BLOCK_QWS) {
+ /*
+ * Still within SOP block. We don't need to check for
+ * wrap because we are still in the first block and
+ * can only wrap on block boundaries.
+ */
+ void __iomem *send; /* SOP end */
+ void __iomem *xend;
+
+ /*
+ * calculate the end of data or end of block, whichever
+ * comes first
+ */
+ send = pbuf->start + PIO_BLOCK_SIZE;
+ xend = min(send, dend);
+
+ /* shift up to SOP=1 space */
+ dest += SOP_DISTANCE;
+ xend += SOP_DISTANCE;
+
+ /* write 8-byte chunk data */
+ while (dest < xend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* shift down to SOP=0 space */
+ dest -= SOP_DISTANCE;
+ }
+ /*
+ * At this point dest could be (either, both, or neither):
+ * - at dend
+ * - at the wrap
+ */
+
+ /*
+ * If the wrap comes before or matches the data end,
+ * copy until until the wrap, then wrap.
+ *
+ * If dest is at the wrap, we will fall into the if,
+ * not do the loop, when wrap.
+ *
+ * If the data ends at the end of the SOP above and
+ * the buffer wraps, then pbuf->end == dend == dest
+ * and nothing will get written.
+ */
+ if (pbuf->end <= dend) {
+ while (dest < pbuf->end) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ dest -= pbuf->size;
+ dend -= pbuf->size;
+ }
+
+ /* write 8-byte non-SOP, non-wrap chunk data */
+ while (dest < dend) {
+ writeq(*(u64 *)from, dest);
+ from += sizeof(u64);
+ dest += sizeof(u64);
+ }
+
+ /* we know carry_bytes was zero on entry to this routine */
+ read_low_bytes(pbuf, from, nbytes & 0x7);
+
+ pbuf->qw_written += nbytes >> 3;
+}
+
+/*
+ * Segmented PIO Copy - middle
+ *
+ * Must handle any aligned tail and any aligned source with any byte count.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @from: data source
+ * @nbytes: number of bytes to copy
+ */
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+ unsigned long from_align = (unsigned long)from & 0x7;
+
+ if (pbuf->carry_bytes + nbytes < 8) {
+ /* not enough bytes to fill a QW */
+ read_extra_bytes(pbuf, from, nbytes);
+ return;
+ }
+
+ if (from_align) {
+ /* misaligned source pointer - align it */
+ unsigned long to_align;
+
+ /* bytes to read to align "from" */
+ to_align = 8 - from_align;
+
+ /*
+ * In the advance-to-alignment logic below, we do not need
+ * to check if we are using more than nbytes. This is because
+ * if we are here, we already know that carry+nbytes will
+ * fill at least one QW.
+ */
+ if (pbuf->carry_bytes + to_align < 8) {
+ /* not enough align bytes to fill a QW */
+ read_extra_bytes(pbuf, from, to_align);
+ from += to_align;
+ nbytes -= to_align;
+ } else {
+ /* bytes to fill carry */
+ unsigned long to_fill = 8 - pbuf->carry_bytes;
+ /* bytes left over to be read */
+ unsigned long extra = to_align - to_fill;
+ void __iomem *dest;
+
+ /* fill carry... */
+ read_extra_bytes(pbuf, from, to_fill);
+ from += to_fill;
+ nbytes -= to_fill;
+ /* may not be enough valid bytes left to align */
+ if (extra > nbytes)
+ extra = nbytes;
+
+ /* ...now write carry */
+ dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+ /*
+ * The two checks immediately below cannot both be
+ * true, hence the else. If we have wrapped, we
+ * cannot still be within the first block.
+ * Conversely, if we are still in the first block, we
+ * cannot have wrapped. We do the wrap check first
+ * as that is more likely.
+ */
+ /* adjust if we've wrapped */
+ if (dest >= pbuf->end)
+ dest -= pbuf->size;
+ /* jump to SOP range if within the first block */
+ else if (pbuf->qw_written < PIO_BLOCK_QWS)
+ dest += SOP_DISTANCE;
+
+ carry8_write8(pbuf->carry, dest);
+ pbuf->qw_written++;
+
+ /* read any extra bytes to do final alignment */
+ /* this will overwrite anything in pbuf->carry */
+ read_low_bytes(pbuf, from, extra);
+ from += extra;
+ nbytes -= extra;
+ /*
+ * If no bytes are left, return early - we are done.
+ * NOTE: This short-circuit is *required* because
+ * "extra" may have been reduced in size and "from"
+ * is not aligned, as required when leaving this
+ * if block.
+ */
+ if (nbytes == 0)
+ return;
+ }
+
+ /* at this point, from is QW aligned */
+ }
+
+ if (pbuf->carry_bytes)
+ mid_copy_mix(pbuf, from, nbytes);
+ else
+ mid_copy_straight(pbuf, from, nbytes);
+}
+
+/*
+ * Segmented PIO Copy - end
+ *
+ * Write any remainder (in pbuf->carry) and finish writing the whole block.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ */
+void seg_pio_copy_end(struct pio_buf *pbuf)
+{
+ void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+ /*
+ * The two checks immediately below cannot both be true, hence the
+ * else. If we have wrapped, we cannot still be within the first
+ * block. Conversely, if we are still in the first block, we
+ * cannot have wrapped. We do the wrap check first as that is
+ * more likely.
+ */
+ /* adjust if we have wrapped */
+ if (dest >= pbuf->end)
+ dest -= pbuf->size;
+ /* jump to the SOP range if within the first block */
+ else if (pbuf->qw_written < PIO_BLOCK_QWS)
+ dest += SOP_DISTANCE;
+
+ /* write final bytes, if any */
+ if (carry_write8(pbuf, dest)) {
+ dest += sizeof(u64);
+ /*
+ * NOTE: We do not need to recalculate whether dest needs
+ * SOP_DISTANCE or not.
+ *
+ * If we are in the first block and the dangle write
+ * keeps us in the same block, dest will need
+ * to retain SOP_DISTANCE in the loop below.
+ *
+ * If we are in the first block and the dangle write pushes
+ * us to the next block, then loop below will not run
+ * and dest is not used. Hence we do not need to update
+ * it.
+ *
+ * If we are past the first block, then SOP_DISTANCE
+ * was never added, so there is nothing to do.
+ */
+ }
+
+ /* fill in rest of block */
+ while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+ writeq(0, dest);
+ dest += sizeof(u64);
+ }
+
+ /* finished with this buffer */
+ this_cpu_dec(*pbuf->sc->buffers_allocated);
+ preempt_enable();
+}
diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c
new file mode 100644
index 000000000000..965c8aef0c60
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/platform.c
@@ -0,0 +1,899 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "efivar.h"
+
+void get_platform_config(struct hfi1_devdata *dd)
+{
+ int ret = 0;
+ unsigned long size = 0;
+ u8 *temp_platform_config = NULL;
+
+ ret = read_hfi1_efi_var(dd, "configuration", &size,
+ (void **)&temp_platform_config);
+ if (ret) {
+ dd_dev_info(dd,
+ "%s: Failed to get platform config from UEFI, falling back to request firmware\n",
+ __func__);
+ /* fall back to request firmware */
+ platform_config_load = 1;
+ goto bail;
+ }
+
+ dd->platform_config.data = temp_platform_config;
+ dd->platform_config.size = size;
+
+bail:
+ /* exit */;
+}
+
+void free_platform_config(struct hfi1_devdata *dd)
+{
+ if (!platform_config_load) {
+ /*
+ * was loaded from EFI, release memory
+ * allocated by read_efi_var
+ */
+ kfree(dd->platform_config.data);
+ }
+ /*
+ * else do nothing, dispose_firmware will release
+ * struct firmware platform_config on driver exit
+ */
+}
+
+void get_port_type(struct hfi1_pportdata *ppd)
+{
+ int ret;
+
+ ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_PORT_TYPE, &ppd->port_type,
+ 4);
+ if (ret)
+ ppd->port_type = PORT_TYPE_UNKNOWN;
+}
+
+int set_qsfp_tx(struct hfi1_pportdata *ppd, int on)
+{
+ u8 tx_ctrl_byte = on ? 0x0 : 0xF;
+ int ret = 0;
+
+ ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS,
+ &tx_ctrl_byte, 1);
+ /* we expected 1, so consider 0 an error */
+ if (ret == 0)
+ ret = -EIO;
+ else if (ret == 1)
+ ret = 0;
+ return ret;
+}
+
+static int qual_power(struct hfi1_pportdata *ppd)
+{
+ u32 cable_power_class = 0, power_class_max = 0;
+ u8 *cache = ppd->qsfp_info.cache;
+ int ret = 0;
+
+ ret = get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+ SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4);
+ if (ret)
+ return ret;
+
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+ if (cable_power_class > power_class_max)
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY);
+
+ if (ppd->offline_disabled_reason ==
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: Port disabled due to system power restrictions\n",
+ __func__);
+ ret = -EPERM;
+ }
+ return ret;
+}
+
+static int qual_bitrate(struct hfi1_pportdata *ppd)
+{
+ u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
+ u8 *cache = ppd->qsfp_info.cache;
+
+ if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) &&
+ cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64)
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
+
+ if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) &&
+ cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D)
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY);
+
+ if (ppd->offline_disabled_reason ==
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: Cable failed bitrate check, disabling port\n",
+ __func__);
+ return -EPERM;
+ }
+ return 0;
+}
+
+static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
+{
+ u8 cable_power_class = 0, power_ctrl_byte = 0;
+ u8 *cache = ppd->qsfp_info.cache;
+ int ret;
+
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+ if (cable_power_class > QSFP_POWER_CLASS_1) {
+ power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
+
+ power_ctrl_byte |= 1;
+ power_ctrl_byte &= ~(0x2);
+
+ ret = qsfp_write(ppd, ppd->dd->hfi1_id,
+ QSFP_PWR_CTRL_BYTE_OFFS,
+ &power_ctrl_byte, 1);
+ if (ret != 1)
+ return -EIO;
+
+ if (cable_power_class > QSFP_POWER_CLASS_4) {
+ power_ctrl_byte |= (1 << 2);
+ ret = qsfp_write(ppd, ppd->dd->hfi1_id,
+ QSFP_PWR_CTRL_BYTE_OFFS,
+ &power_ctrl_byte, 1);
+ if (ret != 1)
+ return -EIO;
+ }
+
+ /* SFF 8679 rev 1.7 LPMode Deassert time */
+ msleep(300);
+ }
+ return 0;
+}
+
+static void apply_rx_cdr(struct hfi1_pportdata *ppd,
+ u32 rx_preset_index,
+ u8 *cdr_ctrl_byte)
+{
+ u32 rx_preset;
+ u8 *cache = ppd->qsfp_info.cache;
+ int cable_power_class;
+
+ if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
+ (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
+ return;
+
+ /* RX CDR present, bypass supported */
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+ if (cable_power_class <= QSFP_POWER_CLASS_3) {
+ /* Power class <= 3, ignore config & turn RX CDR on */
+ *cdr_ctrl_byte |= 0xF;
+ return;
+ }
+
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+ rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+ &rx_preset, 4);
+
+ if (!rx_preset) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: RX_CDR_APPLY is set to disabled\n",
+ __func__);
+ return;
+ }
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+ rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR,
+ &rx_preset, 4);
+
+ /* Expand cdr setting to all 4 lanes */
+ rx_preset = (rx_preset | (rx_preset << 1) |
+ (rx_preset << 2) | (rx_preset << 3));
+
+ if (rx_preset) {
+ *cdr_ctrl_byte |= rx_preset;
+ } else {
+ *cdr_ctrl_byte &= rx_preset;
+ /* Preserve current TX CDR status */
+ *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0);
+ }
+}
+
+static void apply_tx_cdr(struct hfi1_pportdata *ppd,
+ u32 tx_preset_index,
+ u8 *cdr_ctrl_byte)
+{
+ u32 tx_preset;
+ u8 *cache = ppd->qsfp_info.cache;
+ int cable_power_class;
+
+ if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
+ (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
+ return;
+
+ /* TX CDR present, bypass supported */
+ cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+ if (cable_power_class <= QSFP_POWER_CLASS_3) {
+ /* Power class <= 3, ignore config & turn TX CDR on */
+ *cdr_ctrl_byte |= 0xF0;
+ return;
+ }
+
+ get_platform_config_field(
+ ppd->dd,
+ PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
+ TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4);
+
+ if (!tx_preset) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: TX_CDR_APPLY is set to disabled\n",
+ __func__);
+ return;
+ }
+ get_platform_config_field(
+ ppd->dd,
+ PLATFORM_CONFIG_TX_PRESET_TABLE,
+ tx_preset_index,
+ TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4);
+
+ /* Expand cdr setting to all 4 lanes */
+ tx_preset = (tx_preset | (tx_preset << 1) |
+ (tx_preset << 2) | (tx_preset << 3));
+
+ if (tx_preset)
+ *cdr_ctrl_byte |= (tx_preset << 4);
+ else
+ /* Preserve current/determined RX CDR status */
+ *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+}
+
+static void apply_cdr_settings(
+ struct hfi1_pportdata *ppd, u32 rx_preset_index,
+ u32 tx_preset_index)
+{
+ u8 *cache = ppd->qsfp_info.cache;
+ u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+
+ apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte);
+
+ apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte);
+
+ qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
+ &cdr_ctrl_byte, 1);
+}
+
+static void apply_tx_eq_auto(struct hfi1_pportdata *ppd)
+{
+ u8 *cache = ppd->qsfp_info.cache;
+ u8 tx_eq;
+
+ if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8))
+ return;
+ /* Disable adaptive TX EQ if present */
+ tx_eq = cache[(128 * 3) + 241];
+ tx_eq &= 0xF0;
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1);
+}
+
+static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index)
+{
+ u8 *cache = ppd->qsfp_info.cache;
+ u32 tx_preset;
+ u8 tx_eq;
+
+ if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4))
+ return;
+
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+ tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+ &tx_preset, 4);
+ if (!tx_preset) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: TX_EQ_APPLY is set to disabled\n",
+ __func__);
+ return;
+ }
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+ tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ,
+ &tx_preset, 4);
+
+ if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: TX EQ %x unsupported\n",
+ __func__, tx_preset);
+
+ dd_dev_info(
+ ppd->dd,
+ "%s: Applying EQ %x\n",
+ __func__, cache[608] & 0xF0);
+
+ tx_preset = (cache[608] & 0xF0) >> 4;
+ }
+
+ tx_eq = tx_preset | (tx_preset << 4);
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1);
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1);
+}
+
+static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index)
+{
+ u32 rx_preset;
+ u8 rx_eq, *cache = ppd->qsfp_info.cache;
+
+ if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2))
+ return;
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+ rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
+ &rx_preset, 4);
+
+ if (!rx_preset) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: RX_EMP_APPLY is set to disabled\n",
+ __func__);
+ return;
+ }
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
+ rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP,
+ &rx_preset, 4);
+
+ if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) {
+ dd_dev_info(
+ ppd->dd,
+ "%s: Requested RX EMP %x\n",
+ __func__, rx_preset);
+
+ dd_dev_info(
+ ppd->dd,
+ "%s: Applying supported EMP %x\n",
+ __func__, cache[608] & 0xF);
+
+ rx_preset = cache[608] & 0xF;
+ }
+
+ rx_eq = rx_preset | (rx_preset << 4);
+
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1);
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1);
+}
+
+static void apply_eq_settings(struct hfi1_pportdata *ppd,
+ u32 rx_preset_index, u32 tx_preset_index)
+{
+ u8 *cache = ppd->qsfp_info.cache;
+
+ /* no point going on w/o a page 3 */
+ if (cache[2] & 4) {
+ dd_dev_info(ppd->dd,
+ "%s: Upper page 03 not present\n",
+ __func__);
+ return;
+ }
+
+ apply_tx_eq_auto(ppd);
+
+ apply_tx_eq_prog(ppd, tx_preset_index);
+
+ apply_rx_eq_emp(ppd, rx_preset_index);
+}
+
+static void apply_rx_amplitude_settings(
+ struct hfi1_pportdata *ppd, u32 rx_preset_index,
+ u32 tx_preset_index)
+{
+ u32 rx_preset;
+ u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache;
+
+ /* no point going on w/o a page 3 */
+ if (cache[2] & 4) {
+ dd_dev_info(ppd->dd,
+ "%s: Upper page 03 not present\n",
+ __func__);
+ return;
+ }
+ if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) {
+ dd_dev_info(ppd->dd,
+ "%s: RX_AMP_APPLY is set to disabled\n",
+ __func__);
+ return;
+ }
+
+ get_platform_config_field(ppd->dd,
+ PLATFORM_CONFIG_RX_PRESET_TABLE,
+ rx_preset_index,
+ RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+ &rx_preset, 4);
+
+ if (!rx_preset) {
+ dd_dev_info(ppd->dd,
+ "%s: RX_AMP_APPLY is set to disabled\n",
+ __func__);
+ return;
+ }
+ get_platform_config_field(ppd->dd,
+ PLATFORM_CONFIG_RX_PRESET_TABLE,
+ rx_preset_index,
+ RX_PRESET_TABLE_QSFP_RX_AMP,
+ &rx_preset, 4);
+
+ dd_dev_info(ppd->dd,
+ "%s: Requested RX AMP %x\n",
+ __func__,
+ rx_preset);
+
+ for (i = 0; i < 4; i++) {
+ if (cache[(128 * 3) + 225] & (1 << i)) {
+ preferred = i;
+ if (preferred == rx_preset)
+ break;
+ }
+ }
+
+ /*
+ * Verify that preferred RX amplitude is not just a
+ * fall through of the default
+ */
+ if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) {
+ dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n");
+ return;
+ }
+
+ dd_dev_info(ppd->dd,
+ "%s: Applying RX AMP %x\n", __func__, preferred);
+
+ rx_amp = preferred | (preferred << 4);
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1);
+ qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1);
+}
+
+#define OPA_INVALID_INDEX 0xFFF
+
+static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id,
+ u32 config_data, const char *message)
+{
+ u8 i;
+ int ret = HCMD_SUCCESS;
+
+ for (i = 0; i < 4; i++) {
+ ret = load_8051_config(ppd->dd, field_id, i, config_data);
+ if (ret != HCMD_SUCCESS) {
+ dd_dev_err(
+ ppd->dd,
+ "%s: %s for lane %u failed\n",
+ message, __func__, i);
+ }
+ }
+}
+
+static void apply_tunings(
+ struct hfi1_pportdata *ppd, u32 tx_preset_index,
+ u8 tuning_method, u32 total_atten, u8 limiting_active)
+{
+ int ret = 0;
+ u32 config_data = 0, tx_preset = 0;
+ u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
+ u8 *cache = ppd->qsfp_info.cache;
+
+ /* Pass tuning method to 8051 */
+ read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
+ &config_data);
+ config_data &= ~(0xff << TUNING_METHOD_SHIFT);
+ config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT);
+ ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
+ config_data);
+ if (ret != HCMD_SUCCESS)
+ dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n",
+ __func__);
+
+ /* Set same channel loss for both TX and RX */
+ config_data = 0 | (total_atten << 16) | (total_atten << 24);
+ apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data,
+ "Setting channel loss");
+
+ /* Inform 8051 of cable capabilities */
+ if (ppd->qsfp_info.cache_valid) {
+ external_device_config =
+ ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) |
+ ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) |
+ ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) |
+ (cache[QSFP_EQ_INFO_OFFS] & 0x4);
+ ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
+ GENERAL_CONFIG, &config_data);
+ /* Clear, then set the external device config field */
+ config_data &= ~(u32)0xFF;
+ config_data |= external_device_config;
+ ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS,
+ GENERAL_CONFIG, config_data);
+ if (ret != HCMD_SUCCESS)
+ dd_dev_info(ppd->dd,
+ "%s: Failed set ext device config params\n",
+ __func__);
+ }
+
+ if (tx_preset_index == OPA_INVALID_INDEX) {
+ if (ppd->port_type == PORT_TYPE_QSFP && limiting_active)
+ dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n",
+ __func__);
+ return;
+ }
+
+ /* Following for limiting active channels only */
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
+ TX_PRESET_TABLE_PRECUR, &tx_preset, 4);
+ precur = tx_preset;
+
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+ tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4);
+ attn = tx_preset;
+
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE,
+ tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4);
+ postcur = tx_preset;
+
+ config_data = precur | (attn << 8) | (postcur << 16);
+
+ apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data,
+ "Applying TX settings");
+}
+
+/* Must be holding the QSFP i2c resource */
+static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
+ u32 *ptr_rx_preset, u32 *ptr_total_atten)
+{
+ int ret;
+ u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
+ u8 *cache = ppd->qsfp_info.cache;
+
+ ppd->qsfp_info.limiting_active = 1;
+
+ ret = set_qsfp_tx(ppd, 0);
+ if (ret)
+ return ret;
+
+ ret = qual_power(ppd);
+ if (ret)
+ return ret;
+
+ ret = qual_bitrate(ppd);
+ if (ret)
+ return ret;
+
+ /*
+ * We'll change the QSFP memory contents from here on out, thus we set a
+ * flag here to remind ourselves to reset the QSFP module. This prevents
+ * reuse of stale settings established in our previous pass through.
+ */
+ if (ppd->qsfp_info.reset_needed) {
+ reset_qsfp(ppd);
+ refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+ } else {
+ ppd->qsfp_info.reset_needed = 1;
+ }
+
+ ret = set_qsfp_high_power(ppd);
+ if (ret)
+ return ret;
+
+ if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
+ ret = get_platform_config_field(
+ ppd->dd,
+ PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+ ptr_tx_preset, 4);
+ if (ret) {
+ *ptr_tx_preset = OPA_INVALID_INDEX;
+ return ret;
+ }
+ } else {
+ ret = get_platform_config_field(
+ ppd->dd,
+ PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+ ptr_tx_preset, 4);
+ if (ret) {
+ *ptr_tx_preset = OPA_INVALID_INDEX;
+ return ret;
+ }
+ }
+
+ ret = get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
+ if (ret) {
+ *ptr_rx_preset = OPA_INVALID_INDEX;
+ return ret;
+ }
+
+ if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4);
+ else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G))
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4);
+
+ apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+ apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+ apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset);
+
+ ret = set_qsfp_tx(ppd, 1);
+
+ return ret;
+}
+
+static int tune_qsfp(struct hfi1_pportdata *ppd,
+ u32 *ptr_tx_preset, u32 *ptr_rx_preset,
+ u8 *ptr_tuning_method, u32 *ptr_total_atten)
+{
+ u32 cable_atten = 0, remote_atten = 0, platform_atten = 0;
+ u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
+ int ret = 0;
+ u8 *cache = ppd->qsfp_info.cache;
+
+ switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
+ case 0xA ... 0xB:
+ ret = get_platform_config_field(
+ ppd->dd,
+ PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_LOCAL_ATTEN_25G,
+ &platform_atten, 4);
+ if (ret)
+ return ret;
+
+ if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
+ cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS];
+ else if ((lss & OPA_LINK_SPEED_12_5G) &&
+ (lse & OPA_LINK_SPEED_12_5G))
+ cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS];
+
+ /* Fallback to configured attenuation if cable memory is bad */
+ if (cable_atten == 0 || cable_atten > 36) {
+ ret = get_platform_config_field(
+ ppd->dd,
+ PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+ SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+ &cable_atten, 4);
+ if (ret)
+ return ret;
+ }
+
+ ret = get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
+ if (ret)
+ return ret;
+
+ *ptr_total_atten = platform_atten + cable_atten + remote_atten;
+
+ *ptr_tuning_method = OPA_PASSIVE_TUNING;
+ break;
+ case 0x0 ... 0x9: /* fallthrough */
+ case 0xC: /* fallthrough */
+ case 0xE:
+ ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
+ ptr_total_atten);
+ if (ret)
+ return ret;
+
+ *ptr_tuning_method = OPA_ACTIVE_TUNING;
+ break;
+ case 0xD: /* fallthrough */
+ case 0xF:
+ default:
+ dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n",
+ __func__);
+ break;
+ }
+ return ret;
+}
+
+/*
+ * This function communicates its success or failure via ppd->driver_link_ready
+ * Thus, it depends on its association with start_link(...) which checks
+ * driver_link_ready before proceeding with the link negotiation and
+ * initialization process.
+ */
+void tune_serdes(struct hfi1_pportdata *ppd)
+{
+ int ret = 0;
+ u32 total_atten = 0;
+ u32 remote_atten = 0, platform_atten = 0;
+ u32 rx_preset_index, tx_preset_index;
+ u8 tuning_method = 0, limiting_active = 0;
+ struct hfi1_devdata *dd = ppd->dd;
+
+ rx_preset_index = OPA_INVALID_INDEX;
+ tx_preset_index = OPA_INVALID_INDEX;
+
+ /* the link defaults to enabled */
+ ppd->link_enabled = 1;
+ /* the driver link ready state defaults to not ready */
+ ppd->driver_link_ready = 0;
+ ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
+
+ /* Skip the tuning for testing (loopback != none) and simulations */
+ if (loopback != LOOPBACK_NONE ||
+ ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+ ppd->driver_link_ready = 1;
+ return;
+ }
+
+ switch (ppd->port_type) {
+ case PORT_TYPE_DISCONNECTED:
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED);
+ dd_dev_info(dd, "%s: Port disconnected, disabling port\n",
+ __func__);
+ goto bail;
+ case PORT_TYPE_FIXED:
+ /* platform_atten, remote_atten pre-zeroed to catch error */
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4);
+
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4);
+
+ total_atten = platform_atten + remote_atten;
+
+ tuning_method = OPA_PASSIVE_TUNING;
+ break;
+ case PORT_TYPE_VARIABLE:
+ if (qsfp_mod_present(ppd)) {
+ /*
+ * platform_atten, remote_atten pre-zeroed to
+ * catch error
+ */
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_LOCAL_ATTEN_25G,
+ &platform_atten, 4);
+
+ get_platform_config_field(
+ ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0,
+ PORT_TABLE_REMOTE_ATTEN_25G,
+ &remote_atten, 4);
+
+ total_atten = platform_atten + remote_atten;
+
+ tuning_method = OPA_PASSIVE_TUNING;
+ } else {
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+ goto bail;
+ }
+ break;
+ case PORT_TYPE_QSFP:
+ if (qsfp_mod_present(ppd)) {
+ ret = acquire_chip_resource(ppd->dd,
+ qsfp_resource(ppd->dd),
+ QSFP_WAIT);
+ if (ret) {
+ dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
+ __func__, (int)ppd->dd->hfi1_id);
+ goto bail;
+ }
+ refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+ if (ppd->qsfp_info.cache_valid) {
+ ret = tune_qsfp(ppd,
+ &tx_preset_index,
+ &rx_preset_index,
+ &tuning_method,
+ &total_atten);
+
+ /*
+ * We may have modified the QSFP memory, so
+ * update the cache to reflect the changes
+ */
+ refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+ limiting_active =
+ ppd->qsfp_info.limiting_active;
+ } else {
+ dd_dev_err(dd,
+ "%s: Reading QSFP memory failed\n",
+ __func__);
+ ret = -EINVAL; /* a fail indication */
+ }
+ release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
+ if (ret)
+ goto bail;
+ } else {
+ ppd->offline_disabled_reason =
+ HFI1_ODR_MASK(
+ OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+ goto bail;
+ }
+ break;
+ default:
+ dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
+ ppd->port_type = PORT_TYPE_UNKNOWN;
+ tuning_method = OPA_UNKNOWN_TUNING;
+ total_atten = 0;
+ limiting_active = 0;
+ tx_preset_index = OPA_INVALID_INDEX;
+ break;
+ }
+
+ if (ppd->offline_disabled_reason ==
+ HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))
+ apply_tunings(ppd, tx_preset_index, tuning_method,
+ total_atten, limiting_active);
+
+ if (!ret)
+ ppd->driver_link_ready = 1;
+
+ return;
+bail:
+ ppd->driver_link_ready = 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h
new file mode 100644
index 000000000000..e2c21613c326
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/platform.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __PLATFORM_H
+#define __PLATFORM_H
+
+#define METADATA_TABLE_FIELD_START_SHIFT 0
+#define METADATA_TABLE_FIELD_START_LEN_BITS 15
+#define METADATA_TABLE_FIELD_LEN_SHIFT 16
+#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16
+
+/* Header structure */
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4
+
+enum platform_config_table_type_encoding {
+ PLATFORM_CONFIG_TABLE_RESERVED,
+ PLATFORM_CONFIG_SYSTEM_TABLE,
+ PLATFORM_CONFIG_PORT_TABLE,
+ PLATFORM_CONFIG_RX_PRESET_TABLE,
+ PLATFORM_CONFIG_TX_PRESET_TABLE,
+ PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
+ PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
+ PLATFORM_CONFIG_TABLE_MAX
+};
+
+enum platform_config_system_table_fields {
+ SYSTEM_TABLE_RESERVED,
+ SYSTEM_TABLE_NODE_STRING,
+ SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
+ SYSTEM_TABLE_NODE_GUID,
+ SYSTEM_TABLE_REVISION,
+ SYSTEM_TABLE_VENDOR_OUI,
+ SYSTEM_TABLE_META_VERSION,
+ SYSTEM_TABLE_DEVICE_ID,
+ SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
+ SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+ SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
+ SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+ SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
+ SYSTEM_TABLE_MAX
+};
+
+enum platform_config_port_table_fields {
+ PORT_TABLE_RESERVED,
+ PORT_TABLE_PORT_TYPE,
+ PORT_TABLE_LOCAL_ATTEN_12G,
+ PORT_TABLE_LOCAL_ATTEN_25G,
+ PORT_TABLE_LINK_SPEED_SUPPORTED,
+ PORT_TABLE_LINK_WIDTH_SUPPORTED,
+ PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
+ PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
+ PORT_TABLE_VL_CAP,
+ PORT_TABLE_MTU_CAP,
+ PORT_TABLE_TX_LANE_ENABLE_MASK,
+ PORT_TABLE_LOCAL_MAX_TIMEOUT,
+ PORT_TABLE_REMOTE_ATTEN_12G,
+ PORT_TABLE_REMOTE_ATTEN_25G,
+ PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+ PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+ PORT_TABLE_RX_PRESET_IDX,
+ PORT_TABLE_CABLE_REACH_CLASS,
+ PORT_TABLE_MAX
+};
+
+enum platform_config_rx_preset_table_fields {
+ RX_PRESET_TABLE_RESERVED,
+ RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+ RX_PRESET_TABLE_QSFP_RX_EMP_APPLY,
+ RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+ RX_PRESET_TABLE_QSFP_RX_CDR,
+ RX_PRESET_TABLE_QSFP_RX_EMP,
+ RX_PRESET_TABLE_QSFP_RX_AMP,
+ RX_PRESET_TABLE_MAX
+};
+
+enum platform_config_tx_preset_table_fields {
+ TX_PRESET_TABLE_RESERVED,
+ TX_PRESET_TABLE_PRECUR,
+ TX_PRESET_TABLE_ATTN,
+ TX_PRESET_TABLE_POSTCUR,
+ TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
+ TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+ TX_PRESET_TABLE_QSFP_TX_CDR,
+ TX_PRESET_TABLE_QSFP_TX_EQ,
+ TX_PRESET_TABLE_MAX
+};
+
+enum platform_config_qsfp_attn_table_fields {
+ QSFP_ATTEN_TABLE_RESERVED,
+ QSFP_ATTEN_TABLE_TX_PRESET_IDX,
+ QSFP_ATTEN_TABLE_RX_PRESET_IDX,
+ QSFP_ATTEN_TABLE_MAX
+};
+
+enum platform_config_variable_settings_table_fields {
+ VARIABLE_SETTINGS_TABLE_RESERVED,
+ VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
+ VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
+ VARIABLE_SETTINGS_TABLE_MAX
+};
+
+struct platform_config {
+ size_t size;
+ const u8 *data;
+};
+
+struct platform_config_data {
+ u32 *table;
+ u32 *table_metadata;
+ u32 num_table;
+};
+
+/*
+ * This struct acts as a quick reference into the platform_data binary image
+ * and is populated by parse_platform_config(...) depending on the specific
+ * META_VERSION
+ */
+struct platform_config_cache {
+ u8 cache_valid;
+ struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
+};
+
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+ 0,
+ SYSTEM_TABLE_MAX,
+ PORT_TABLE_MAX,
+ RX_PRESET_TABLE_MAX,
+ TX_PRESET_TABLE_MAX,
+ QSFP_ATTEN_TABLE_MAX,
+ VARIABLE_SETTINGS_TABLE_MAX
+};
+
+/* This section defines default values and encodings for the
+ * fields defined for each table above
+ */
+
+/*
+ * =====================================================
+ * System table encodings
+ * =====================================================
+ */
+#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041
+#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4
+
+/*
+ * These power classes are the same as defined in SFF 8636 spec rev 2.4
+ * describing byte 129 in table 6-16, except enumerated in a different order
+ */
+enum platform_config_qsfp_power_class_encoding {
+ QSFP_POWER_CLASS_1 = 1,
+ QSFP_POWER_CLASS_2,
+ QSFP_POWER_CLASS_3,
+ QSFP_POWER_CLASS_4,
+ QSFP_POWER_CLASS_5,
+ QSFP_POWER_CLASS_6,
+ QSFP_POWER_CLASS_7
+};
+
+/*
+ * ====================================================
+ * Port table encodings
+ * ====================================================
+ */
+enum platform_config_port_type_encoding {
+ PORT_TYPE_UNKNOWN,
+ PORT_TYPE_DISCONNECTED,
+ PORT_TYPE_FIXED,
+ PORT_TYPE_VARIABLE,
+ PORT_TYPE_QSFP,
+ PORT_TYPE_MAX
+};
+
+enum platform_config_link_speed_supported_encoding {
+ LINK_SPEED_SUPP_12G = 1,
+ LINK_SPEED_SUPP_25G,
+ LINK_SPEED_SUPP_12G_25G,
+ LINK_SPEED_SUPP_MAX
+};
+
+/*
+ * This is a subset (not strict) of the link downgrades
+ * supported. The link downgrades supported are expected
+ * to be supplied to the driver by another entity such as
+ * the fabric manager
+ */
+enum platform_config_link_width_supported_encoding {
+ LINK_WIDTH_SUPP_1X = 1,
+ LINK_WIDTH_SUPP_2X,
+ LINK_WIDTH_SUPP_2X_1X,
+ LINK_WIDTH_SUPP_3X,
+ LINK_WIDTH_SUPP_3X_1X,
+ LINK_WIDTH_SUPP_3X_2X,
+ LINK_WIDTH_SUPP_3X_2X_1X,
+ LINK_WIDTH_SUPP_4X,
+ LINK_WIDTH_SUPP_4X_1X,
+ LINK_WIDTH_SUPP_4X_2X,
+ LINK_WIDTH_SUPP_4X_2X_1X,
+ LINK_WIDTH_SUPP_4X_3X,
+ LINK_WIDTH_SUPP_4X_3X_1X,
+ LINK_WIDTH_SUPP_4X_3X_2X,
+ LINK_WIDTH_SUPP_4X_3X_2X_1X,
+ LINK_WIDTH_SUPP_MAX
+};
+
+enum platform_config_virtual_lane_capability_encoding {
+ VL_CAP_VL0 = 1,
+ VL_CAP_VL0_1,
+ VL_CAP_VL0_2,
+ VL_CAP_VL0_3,
+ VL_CAP_VL0_4,
+ VL_CAP_VL0_5,
+ VL_CAP_VL0_6,
+ VL_CAP_VL0_7,
+ VL_CAP_VL0_8,
+ VL_CAP_VL0_9,
+ VL_CAP_VL0_10,
+ VL_CAP_VL0_11,
+ VL_CAP_VL0_12,
+ VL_CAP_VL0_13,
+ VL_CAP_VL0_14,
+ VL_CAP_MAX
+};
+
+/* Max MTU */
+enum platform_config_mtu_capability_encoding {
+ MTU_CAP_256 = 1,
+ MTU_CAP_512 = 2,
+ MTU_CAP_1024 = 3,
+ MTU_CAP_2048 = 4,
+ MTU_CAP_4096 = 5,
+ MTU_CAP_8192 = 6,
+ MTU_CAP_10240 = 7
+};
+
+enum platform_config_local_max_timeout_encoding {
+ LOCAL_MAX_TIMEOUT_10_MS = 1,
+ LOCAL_MAX_TIMEOUT_100_MS,
+ LOCAL_MAX_TIMEOUT_1_S,
+ LOCAL_MAX_TIMEOUT_10_S,
+ LOCAL_MAX_TIMEOUT_100_S,
+ LOCAL_MAX_TIMEOUT_1000_S
+};
+
+enum link_tuning_encoding {
+ OPA_PASSIVE_TUNING,
+ OPA_ACTIVE_TUNING,
+ OPA_UNKNOWN_TUNING
+};
+
+/* platform.c */
+void get_platform_config(struct hfi1_devdata *dd);
+void free_platform_config(struct hfi1_devdata *dd);
+void get_port_type(struct hfi1_pportdata *ppd);
+int set_qsfp_tx(struct hfi1_pportdata *ppd, int on);
+void tune_serdes(struct hfi1_pportdata *ppd);
+
+#endif /*__PLATFORM_H*/
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
new file mode 100644
index 000000000000..4e4d8317c281
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -0,0 +1,1032 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+#include <rdma/ib_verbs.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+#include "verbs_txreq.h"
+
+unsigned int hfi1_qp_table_size = 256;
+module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+static void flush_tx_list(struct rvt_qp *qp);
+static int iowait_sleep(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *stx,
+ unsigned seq);
+static void iowait_wakeup(struct iowait *wait, int reason);
+static void iowait_sdma_drained(struct iowait *wait);
+static void qp_pio_drain(struct rvt_qp *qp);
+
+static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
+ struct rvt_qpn_map *map, unsigned off)
+{
+ return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
+}
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static const u16 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
+
+const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
+[IB_WR_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_RDMA_READ] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_ATOMIC,
+},
+
+[IB_WR_ATOMIC_CMP_AND_SWP] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_ATOMIC_FETCH_AND_ADD] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_RDMA_WRITE_WITH_IMM] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND] = {
+ .length = sizeof(struct ib_send_wr),
+ .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+ BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND_WITH_IMM] = {
+ .length = sizeof(struct ib_send_wr),
+ .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+ BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_REG_MR] = {
+ .length = sizeof(struct ib_reg_wr),
+ .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_LOCAL,
+},
+
+[IB_WR_LOCAL_INV] = {
+ .length = sizeof(struct ib_send_wr),
+ .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_LOCAL,
+},
+
+[IB_WR_SEND_WITH_INV] = {
+ .length = sizeof(struct ib_send_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+},
+
+};
+
+static void flush_tx_list(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ while (!list_empty(&priv->s_iowait.tx_head)) {
+ struct sdma_txreq *tx;
+
+ tx = list_first_entry(
+ &priv->s_iowait.tx_head,
+ struct sdma_txreq,
+ list);
+ list_del_init(&tx->list);
+ hfi1_put_txreq(
+ container_of(tx, struct verbs_txreq, txreq));
+ }
+}
+
+static void flush_iowait(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ if (!list_empty(&priv->s_iowait.list)) {
+ list_del_init(&priv->s_iowait.list);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+}
+
+static inline int opa_mtu_enum_to_int(int mtu)
+{
+ switch (mtu) {
+ case OPA_MTU_8192: return 8192;
+ case OPA_MTU_10240: return 10240;
+ default: return -1;
+ }
+}
+
+/**
+ * This function is what we would push to the core layer if we wanted to be a
+ * "first class citizen". Instead we hide this here and rely on Verbs ULPs
+ * to blindly pass the MTU enum value from the PathRecord to us.
+ */
+static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
+{
+ int val;
+
+ /* Constraining 10KB packets to 8KB packets */
+ if (mtu == (enum ib_mtu)OPA_MTU_10240)
+ mtu = OPA_MTU_8192;
+ val = opa_mtu_enum_to_int((int)mtu);
+ if (val > 0)
+ return val;
+ return ib_mtu_enum_to_int(mtu);
+}
+
+int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct hfi1_ibdev *dev = to_idev(ibqp->device);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ u8 sc;
+
+ if (attr_mask & IB_QP_AV) {
+ sc = ah_to_sc(ibqp->device, &attr->ah_attr);
+ if (sc == 0xf)
+ return -EINVAL;
+
+ if (!qp_to_sdma_engine(qp, sc) &&
+ dd->flags & HFI1_HAS_SEND_DMA)
+ return -EINVAL;
+
+ if (!qp_to_send_context(qp, sc))
+ return -EINVAL;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr);
+ if (sc == 0xf)
+ return -EINVAL;
+
+ if (!qp_to_sdma_engine(qp, sc) &&
+ dd->flags & HFI1_HAS_SEND_DMA)
+ return -EINVAL;
+
+ if (!qp_to_send_context(qp, sc))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (attr_mask & IB_QP_AV) {
+ priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
+ priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+ priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE &&
+ attr->path_mig_state == IB_MIG_MIGRATED &&
+ qp->s_mig_state == IB_MIG_ARMED) {
+ qp->s_flags |= RVT_S_AHG_CLEAR;
+ priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
+ priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+ priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+ }
+}
+
+/**
+ * hfi1_check_send_wqe - validate wqe
+ * @qp - The qp
+ * @wqe - The built wqe
+ *
+ * validate wqe. This is called
+ * prior to inserting the wqe into
+ * the ring but after the wqe has been
+ * setup.
+ *
+ * Returns 0 on success, -EINVAL on failure
+ *
+ */
+int hfi1_check_send_wqe(struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct rvt_ah *ah;
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ if (wqe->length > 0x80000000U)
+ return -EINVAL;
+ break;
+ case IB_QPT_SMI:
+ ah = ibah_to_rvtah(wqe->ud_wr.ah);
+ if (wqe->length > (1 << ah->log_pmtu))
+ return -EINVAL;
+ break;
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ ah = ibah_to_rvtah(wqe->ud_wr.ah);
+ if (wqe->length > (1 << ah->log_pmtu))
+ return -EINVAL;
+ if (ibp->sl_to_sc[ah->attr.sl] == 0xf)
+ return -EINVAL;
+ default:
+ break;
+ }
+ return wqe->length <= piothreshold;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct rvt_qp *qp)
+{
+ u32 aeth = qp->r_msn & HFI1_MSN_MASK;
+
+ if (qp->ibqp.srq) {
+ /*
+ * Shared receive queues don't generate credits.
+ * Set the credit field to the invalid value.
+ */
+ aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+ struct rvt_rwq *wq = qp->r_rq.wq;
+ u32 head;
+ u32 tail;
+
+ /* sanity check pointers before trusting them */
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * There is a small chance that the pair of reads are
+ * not atomic, which is OK, since the fuzziness is
+ * resolved as further ACKs go out.
+ */
+ credits = head - tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ /*
+ * Binary search the credit table to find the code to
+ * use.
+ */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits) {
+ max = x;
+ } else {
+ if (min == x)
+ break;
+ min = x;
+ }
+ }
+ aeth |= x << HFI1_AETH_CREDIT_SHIFT;
+ }
+ return cpu_to_be32(aeth);
+}
+
+/**
+ * _hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress w/o regard to the s_flags.
+ *
+ * It is only used in the post send, which doesn't hold
+ * the s_lock.
+ */
+void _hfi1_schedule_send(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
+}
+
+static void qp_pio_drain(struct rvt_qp *qp)
+{
+ struct hfi1_ibdev *dev;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (!priv->s_sendcontext)
+ return;
+ dev = to_idev(qp->ibqp.device);
+ while (iowait_pio_pending(&priv->s_iowait)) {
+ write_seqlock_irq(&dev->iowait_lock);
+ hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
+ write_sequnlock_irq(&dev->iowait_lock);
+ iowait_pio_drain(&priv->s_iowait);
+ write_seqlock_irq(&dev->iowait_lock);
+ hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
+ write_sequnlock_irq(&dev->iowait_lock);
+ }
+}
+
+/**
+ * hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress and caller should hold
+ * the s_lock.
+ */
+void hfi1_schedule_send(struct rvt_qp *qp)
+{
+ if (hfi1_send_ok(qp))
+ _hfi1_schedule_send(qp);
+}
+
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct rvt_qp *qp, u32 aeth)
+{
+ u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
+
+ /*
+ * If the credit is invalid, we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == HFI1_AETH_CREDIT_INVAL) {
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+ qp->s_flags |= RVT_S_UNLIMITED_CREDIT;
+ if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+ hfi1_schedule_send(qp);
+ }
+ }
+ } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
+ if (cmp_msn(credit, qp->s_lsn) > 0) {
+ qp->s_lsn = credit;
+ if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+ hfi1_schedule_send(qp);
+ }
+ }
+ }
+}
+
+void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & flag) {
+ qp->s_flags &= ~flag;
+ trace_hfi1_qpwakeup(qp, flag);
+ hfi1_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ /* Notify hfi1_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static int iowait_sleep(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *stx,
+ unsigned seq)
+{
+ struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
+ struct rvt_qp *qp;
+ struct hfi1_qp_priv *priv;
+ unsigned long flags;
+ int ret = 0;
+ struct hfi1_ibdev *dev;
+
+ qp = tx->qp;
+ priv = qp->priv;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+ /*
+ * If we couldn't queue the DMA request, save the info
+ * and try again later rather than destroying the
+ * buffer and undoing the side effects of the copy.
+ */
+ /* Make a common routine? */
+ dev = &sde->dd->verbs_dev;
+ list_add_tail(&stx->list, &wait->tx_head);
+ write_seqlock(&dev->iowait_lock);
+ if (sdma_progress(sde, seq, stx))
+ goto eagain;
+ if (list_empty(&priv->s_iowait.list)) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+
+ ibp->rvp.n_dmawait++;
+ qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+ list_add_tail(&priv->s_iowait.list, &sde->dmawait);
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
+ atomic_inc(&qp->refcount);
+ }
+ write_sequnlock(&dev->iowait_lock);
+ qp->s_flags &= ~RVT_S_BUSY;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ret = -EBUSY;
+ } else {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ hfi1_put_txreq(tx);
+ }
+ return ret;
+eagain:
+ write_sequnlock(&dev->iowait_lock);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ list_del_init(&stx->list);
+ return -EAGAIN;
+}
+
+static void iowait_wakeup(struct iowait *wait, int reason)
+{
+ struct rvt_qp *qp = iowait_to_qp(wait);
+
+ WARN_ON(reason != SDMA_AVAIL_REASON);
+ hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC);
+}
+
+static void iowait_sdma_drained(struct iowait *wait)
+{
+ struct rvt_qp *qp = iowait_to_qp(wait);
+ unsigned long flags;
+
+ /*
+ * This happens when the send engine notes
+ * a QP in the error state and cannot
+ * do the flush work until that QP's
+ * sdma work has finished.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & RVT_S_WAIT_DMA) {
+ qp->s_flags &= ~RVT_S_WAIT_DMA;
+ hfi1_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/**
+ *
+ * qp_to_sdma_engine - map a qp to a send engine
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send engine for the qp or NULL for SMI type qp.
+ */
+struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct sdma_engine *sde;
+
+ if (!(dd->flags & HFI1_HAS_SEND_DMA))
+ return NULL;
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ return NULL;
+ default:
+ break;
+ }
+ sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
+ return sde;
+}
+
+/*
+ * qp_to_send_context - map a qp to a send context
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send context for the qp
+ */
+struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ /* SMA packets to VL15 */
+ return dd->vld[15].sc;
+ default:
+ break;
+ }
+
+ return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift,
+ sc5);
+}
+
+struct qp_iter {
+ struct hfi1_ibdev *dev;
+ struct rvt_qp *qp;
+ int specials;
+ int n;
+};
+
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
+{
+ struct qp_iter *iter;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->dev = dev;
+ iter->specials = dev->rdi.ibdev.phys_port_cnt * 2;
+
+ return iter;
+}
+
+int qp_iter_next(struct qp_iter *iter)
+{
+ struct hfi1_ibdev *dev = iter->dev;
+ int n = iter->n;
+ int ret = 1;
+ struct rvt_qp *pqp = iter->qp;
+ struct rvt_qp *qp;
+
+ /*
+ * The approach is to consider the special qps
+ * as an additional table entries before the
+ * real hash table. Since the qp code sets
+ * the qp->next hash link to NULL, this works just fine.
+ *
+ * iter->specials is 2 * # ports
+ *
+ * n = 0..iter->specials is the special qp indices
+ *
+ * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are
+ * the potential hash bucket entries
+ *
+ */
+ for (; n < dev->rdi.qp_dev->qp_table_size + iter->specials; n++) {
+ if (pqp) {
+ qp = rcu_dereference(pqp->next);
+ } else {
+ if (n < iter->specials) {
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ int pidx;
+
+ pidx = n % dev->rdi.ibdev.phys_port_cnt;
+ ppd = &dd_from_dev(dev)->pport[pidx];
+ ibp = &ppd->ibport_data;
+
+ if (!(n & 1))
+ qp = rcu_dereference(ibp->rvp.qp[0]);
+ else
+ qp = rcu_dereference(ibp->rvp.qp[1]);
+ } else {
+ qp = rcu_dereference(
+ dev->rdi.qp_dev->qp_table[
+ (n - iter->specials)]);
+ }
+ }
+ pqp = qp;
+ if (qp) {
+ iter->qp = qp;
+ iter->n = n;
+ return 0;
+ }
+ }
+ return ret;
+}
+
+static const char * const qp_type_str[] = {
+ "SMI", "GSI", "RC", "UC", "UD",
+};
+
+static int qp_idle(struct rvt_qp *qp)
+{
+ return
+ qp->s_last == qp->s_acked &&
+ qp->s_acked == qp->s_cur &&
+ qp->s_cur == qp->s_tail &&
+ qp->s_tail == qp->s_head;
+}
+
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
+{
+ struct rvt_swqe *wqe;
+ struct rvt_qp *qp = iter->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct sdma_engine *sde;
+ struct send_context *send_context;
+
+ sde = qp_to_sdma_engine(qp, priv->s_sc);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ send_context = qp_to_send_context(qp, priv->s_sc);
+ seq_printf(s,
+ "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n",
+ iter->n,
+ qp_idle(qp) ? "I" : "B",
+ qp->ibqp.qp_num,
+ atomic_read(&qp->refcount),
+ qp_type_str[qp->ibqp.qp_type],
+ qp->state,
+ wqe ? wqe->wr.opcode : 0,
+ qp->s_hdrwords,
+ qp->s_flags,
+ iowait_sdma_pending(&priv->s_iowait),
+ iowait_pio_pending(&priv->s_iowait),
+ !list_empty(&priv->s_iowait.list),
+ qp->timeout,
+ wqe ? wqe->ssn : 0,
+ qp->s_lsn,
+ qp->s_last_psn,
+ qp->s_psn, qp->s_next_psn,
+ qp->s_sending_psn, qp->s_sending_hpsn,
+ qp->s_last, qp->s_acked, qp->s_cur,
+ qp->s_tail, qp->s_head, qp->s_size,
+ qp->s_avail,
+ qp->remote_qpn,
+ qp->remote_ah_attr.dlid,
+ qp->remote_ah_attr.sl,
+ qp->pmtu,
+ qp->s_retry,
+ qp->s_retry_cnt,
+ qp->s_rnr_retry_cnt,
+ sde,
+ sde ? sde->this_idx : 0,
+ send_context,
+ send_context ? send_context->sw_index : 0,
+ ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
+ ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+ qp->pid);
+}
+
+void qp_comm_est(struct rvt_qp *qp)
+{
+ qp->r_flags |= RVT_R_COMM_EST;
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_COMM_EST;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
+void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ gfp_t gfp)
+{
+ struct hfi1_qp_priv *priv;
+
+ priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node);
+ if (!priv)
+ return ERR_PTR(-ENOMEM);
+
+ priv->owner = qp;
+
+ priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), gfp,
+ rdi->dparms.node);
+ if (!priv->s_ahg) {
+ kfree(priv);
+ return ERR_PTR(-ENOMEM);
+ }
+ setup_timer(&priv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp);
+ qp->s_timer.function = hfi1_rc_timeout;
+ return priv;
+}
+
+void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ kfree(priv->s_ahg);
+ kfree(priv);
+}
+
+unsigned free_all_qps(struct rvt_dev_info *rdi)
+{
+ struct hfi1_ibdev *verbs_dev = container_of(rdi,
+ struct hfi1_ibdev,
+ rdi);
+ struct hfi1_devdata *dd = container_of(verbs_dev,
+ struct hfi1_devdata,
+ verbs_dev);
+ int n;
+ unsigned qp_inuse = 0;
+
+ for (n = 0; n < dd->num_pports; n++) {
+ struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
+
+ rcu_read_lock();
+ if (rcu_dereference(ibp->rvp.qp[0]))
+ qp_inuse++;
+ if (rcu_dereference(ibp->rvp.qp[1]))
+ qp_inuse++;
+ rcu_read_unlock();
+ }
+
+ return qp_inuse;
+}
+
+void flush_qp_waiters(struct rvt_qp *qp)
+{
+ flush_iowait(qp);
+ hfi1_stop_rc_timers(qp);
+}
+
+void stop_send_queue(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ cancel_work_sync(&priv->s_iowait.iowork);
+ hfi1_del_timers_sync(qp);
+}
+
+void quiesce_qp(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ iowait_sdma_drain(&priv->s_iowait);
+ qp_pio_drain(qp);
+ flush_tx_list(qp);
+}
+
+void notify_qp_reset(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ iowait_init(
+ &priv->s_iowait,
+ 1,
+ _hfi1_do_send,
+ iowait_sleep,
+ iowait_wakeup,
+ iowait_sdma_drained);
+ priv->r_adefered = 0;
+ clear_ahg(qp);
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void hfi1_migrate_qp(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_event ev;
+
+ qp->s_mig_state = IB_MIG_MIGRATED;
+ qp->remote_ah_attr = qp->alt_ah_attr;
+ qp->port_num = qp->alt_ah_attr.port_num;
+ qp->s_pkey_index = qp->s_alt_pkey_index;
+ qp->s_flags |= RVT_S_AHG_CLEAR;
+ priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
+ priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_PATH_MIG;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+int mtu_to_path_mtu(u32 mtu)
+{
+ return mtu_to_enum(mtu, OPA_MTU_8192);
+}
+
+u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu)
+{
+ u32 mtu;
+ struct hfi1_ibdev *verbs_dev = container_of(rdi,
+ struct hfi1_ibdev,
+ rdi);
+ struct hfi1_devdata *dd = container_of(verbs_dev,
+ struct hfi1_devdata,
+ verbs_dev);
+ struct hfi1_ibport *ibp;
+ u8 sc, vl;
+
+ ibp = &dd->pport[qp->port_num - 1].ibport_data;
+ sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ vl = sc_to_vlt(dd, sc);
+
+ mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu);
+ if (vl < PER_VL_SEND_CONTEXTS)
+ mtu = min_t(u32, mtu, dd->vld[vl].mtu);
+ return mtu;
+}
+
+int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_attr *attr)
+{
+ int mtu, pidx = qp->port_num - 1;
+ struct hfi1_ibdev *verbs_dev = container_of(rdi,
+ struct hfi1_ibdev,
+ rdi);
+ struct hfi1_devdata *dd = container_of(verbs_dev,
+ struct hfi1_devdata,
+ verbs_dev);
+ mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu);
+ if (mtu == -1)
+ return -1; /* values less than 0 are error */
+
+ if (mtu > dd->pport[pidx].ibmtu)
+ return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
+ else
+ return attr->path_mtu;
+}
+
+void notify_error_qp(struct rvt_qp *qp)
+{
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ write_seqlock(&dev->iowait_lock);
+ if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) {
+ qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+ list_del_init(&priv->s_iowait.list);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ write_sequnlock(&dev->iowait_lock);
+
+ if (!(qp->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
+ if (qp->s_rdma_mr) {
+ rvt_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ flush_tx_list(qp);
+ }
+}
+
+/**
+ * hfi1_error_port_qps - put a port's RC/UC qps into error state
+ * @ibp: the ibport.
+ * @sl: the service level.
+ *
+ * This function places all RC/UC qps with a given service level into error
+ * state. It is generally called to force upper lay apps to abandon stale qps
+ * after an sl->sc mapping change.
+ */
+void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl)
+{
+ struct rvt_qp *qp = NULL;
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_ibdev *dev = &ppd->dd->verbs_dev;
+ int n;
+ int lastwqe;
+ struct ib_event ev;
+
+ rcu_read_lock();
+
+ /* Deal only with RC/UC qps that use the given SL. */
+ for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) {
+ for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp;
+ qp = rcu_dereference(qp->next)) {
+ if (qp->port_num == ppd->port &&
+ (qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_RC) &&
+ qp->remote_ah_attr.sl == sl &&
+ (ib_rvt_state_ops[qp->state] &
+ RVT_POST_SEND_OK)) {
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_hlock);
+ spin_lock(&qp->s_lock);
+ lastwqe = rvt_error_qp(qp,
+ IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->s_hlock);
+ spin_unlock_irq(&qp->r_lock);
+ if (lastwqe) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event =
+ IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev,
+ qp->ibqp.qp_context);
+ }
+ }
+ }
+ }
+
+ rcu_read_unlock();
+}
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
new file mode 100644
index 000000000000..587d84d65bb8
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -0,0 +1,162 @@
+#ifndef _QP_H
+#define _QP_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/hash.h>
+#include <rdma/rdmavt_qp.h>
+#include "verbs.h"
+#include "sdma.h"
+
+extern unsigned int hfi1_qp_table_size;
+
+extern const struct rvt_operation_params hfi1_post_parms[];
+
+/*
+ * free_ahg - clear ahg from QP
+ */
+static inline void clear_ahg(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ priv->s_ahg->ahgcount = 0;
+ qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
+ if (priv->s_sde && qp->s_ahgidx >= 0)
+ sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
+ qp->s_ahgidx = -1;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct rvt_qp *qp);
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+/**
+ * hfi1_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct rvt_qp *qp, u32 aeth);
+
+/**
+ * hfi1_qp_wakeup - wake up on the indicated event
+ * @qp: the QP
+ * @flag: flag the qp on which the qp is stalled
+ */
+void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag);
+
+struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5);
+struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5);
+
+struct qp_iter;
+
+/**
+ * qp_iter_init - initialize the iterator for the qp hash list
+ * @dev: the hfi1_ibdev
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
+
+/**
+ * qp_iter_next - Find the next qp in the hash list
+ * @iter: the iterator for the qp hash list
+ */
+int qp_iter_next(struct qp_iter *iter);
+
+/**
+ * qp_iter_print - print the qp information to seq_file
+ * @s: the seq_file to emit the qp information on
+ * @iter: the iterator for the qp hash list
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
+
+/**
+ * qp_comm_est - handle trap with QP established
+ * @qp: the QP
+ */
+void qp_comm_est(struct rvt_qp *qp);
+
+void _hfi1_schedule_send(struct rvt_qp *qp);
+void hfi1_schedule_send(struct rvt_qp *qp);
+
+void hfi1_migrate_qp(struct rvt_qp *qp);
+
+/*
+ * Functions provided by hfi1 driver for rdmavt to use
+ */
+void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ gfp_t gfp);
+void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+unsigned free_all_qps(struct rvt_dev_info *rdi);
+void notify_qp_reset(struct rvt_qp *qp);
+int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_attr *attr);
+void flush_qp_waiters(struct rvt_qp *qp);
+void notify_error_qp(struct rvt_qp *qp);
+void stop_send_queue(struct rvt_qp *qp);
+void quiesce_qp(struct rvt_qp *qp);
+u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
+int mtu_to_path_mtu(u32 mtu);
+void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
+#endif /* _QP_H */
diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c
new file mode 100644
index 000000000000..4e95ad810847
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/qsfp.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+
+/* for the given bus number, return the CSR for reading an i2c line */
+static inline u32 i2c_in_csr(u32 bus_num)
+{
+ return bus_num ? ASIC_QSFP2_IN : ASIC_QSFP1_IN;
+}
+
+/* for the given bus number, return the CSR for writing an i2c line */
+static inline u32 i2c_oe_csr(u32 bus_num)
+{
+ return bus_num ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+}
+
+static void hfi1_setsda(void *data, int state)
+{
+ struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+ struct hfi1_devdata *dd = bus->controlling_dd;
+ u64 reg;
+ u32 target_oe;
+
+ target_oe = i2c_oe_csr(bus->num);
+ reg = read_csr(dd, target_oe);
+ /*
+ * The OE bit value is inverted and connected to the pin. When
+ * OE is 0 the pin is left to be pulled up, when the OE is 1
+ * the pin is driven low. This matches the "open drain" or "open
+ * collector" convention.
+ */
+ if (state)
+ reg &= ~QSFP_HFI0_I2CDAT;
+ else
+ reg |= QSFP_HFI0_I2CDAT;
+ write_csr(dd, target_oe, reg);
+ /* do a read to force the write into the chip */
+ (void)read_csr(dd, target_oe);
+}
+
+static void hfi1_setscl(void *data, int state)
+{
+ struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+ struct hfi1_devdata *dd = bus->controlling_dd;
+ u64 reg;
+ u32 target_oe;
+
+ target_oe = i2c_oe_csr(bus->num);
+ reg = read_csr(dd, target_oe);
+ /*
+ * The OE bit value is inverted and connected to the pin. When
+ * OE is 0 the pin is left to be pulled up, when the OE is 1
+ * the pin is driven low. This matches the "open drain" or "open
+ * collector" convention.
+ */
+ if (state)
+ reg &= ~QSFP_HFI0_I2CCLK;
+ else
+ reg |= QSFP_HFI0_I2CCLK;
+ write_csr(dd, target_oe, reg);
+ /* do a read to force the write into the chip */
+ (void)read_csr(dd, target_oe);
+}
+
+static int hfi1_getsda(void *data)
+{
+ struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+ u64 reg;
+ u32 target_in;
+
+ hfi1_setsda(data, 1); /* clear OE so we do not pull line down */
+ udelay(2); /* 1us pull up + 250ns hold */
+
+ target_in = i2c_in_csr(bus->num);
+ reg = read_csr(bus->controlling_dd, target_in);
+ return !!(reg & QSFP_HFI0_I2CDAT);
+}
+
+static int hfi1_getscl(void *data)
+{
+ struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+ u64 reg;
+ u32 target_in;
+
+ hfi1_setscl(data, 1); /* clear OE so we do not pull line down */
+ udelay(2); /* 1us pull up + 250ns hold */
+
+ target_in = i2c_in_csr(bus->num);
+ reg = read_csr(bus->controlling_dd, target_in);
+ return !!(reg & QSFP_HFI0_I2CCLK);
+}
+
+/*
+ * Allocate and initialize the given i2c bus number.
+ * Returns NULL on failure.
+ */
+static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd,
+ struct hfi1_asic_data *ad, int num)
+{
+ struct hfi1_i2c_bus *bus;
+ int ret;
+
+ bus = kzalloc(sizeof(*bus), GFP_KERNEL);
+ if (!bus)
+ return NULL;
+
+ bus->controlling_dd = dd;
+ bus->num = num; /* our bus number */
+
+ bus->algo.setsda = hfi1_setsda;
+ bus->algo.setscl = hfi1_setscl;
+ bus->algo.getsda = hfi1_getsda;
+ bus->algo.getscl = hfi1_getscl;
+ bus->algo.udelay = 5;
+ bus->algo.timeout = usecs_to_jiffies(50);
+ bus->algo.data = bus;
+
+ bus->adapter.owner = THIS_MODULE;
+ bus->adapter.algo_data = &bus->algo;
+ bus->adapter.dev.parent = &dd->pcidev->dev;
+ snprintf(bus->adapter.name, sizeof(bus->adapter.name),
+ "hfi1_i2c%d", num);
+
+ ret = i2c_bit_add_bus(&bus->adapter);
+ if (ret) {
+ dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n",
+ __func__, num, ret);
+ kfree(bus);
+ return NULL;
+ }
+
+ return bus;
+}
+
+/*
+ * Initialize i2c buses.
+ * Return 0 on success, -errno on error.
+ */
+int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad)
+{
+ ad->i2c_bus0 = init_i2c_bus(dd, ad, 0);
+ ad->i2c_bus1 = init_i2c_bus(dd, ad, 1);
+ if (!ad->i2c_bus0 || !ad->i2c_bus1)
+ return -ENOMEM;
+ return 0;
+};
+
+static void clean_i2c_bus(struct hfi1_i2c_bus *bus)
+{
+ if (bus) {
+ i2c_del_adapter(&bus->adapter);
+ kfree(bus);
+ }
+}
+
+void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad)
+{
+ clean_i2c_bus(ad->i2c_bus0);
+ ad->i2c_bus0 = NULL;
+ clean_i2c_bus(ad->i2c_bus1);
+ ad->i2c_bus1 = NULL;
+}
+
+static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c,
+ u8 slave_addr, int offset, int offset_size,
+ u8 *data, u16 len)
+{
+ int ret;
+ int num_msgs;
+ u8 offset_bytes[2];
+ struct i2c_msg msgs[2];
+
+ switch (offset_size) {
+ case 0:
+ num_msgs = 1;
+ msgs[0].addr = slave_addr;
+ msgs[0].flags = 0;
+ msgs[0].len = len;
+ msgs[0].buf = data;
+ break;
+ case 2:
+ offset_bytes[1] = (offset >> 8) & 0xff;
+ /* fall through */
+ case 1:
+ num_msgs = 2;
+ offset_bytes[0] = offset & 0xff;
+
+ msgs[0].addr = slave_addr;
+ msgs[0].flags = 0;
+ msgs[0].len = offset_size;
+ msgs[0].buf = offset_bytes;
+
+ msgs[1].addr = slave_addr;
+ msgs[1].flags = I2C_M_NOSTART,
+ msgs[1].len = len;
+ msgs[1].buf = data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ i2c->controlling_dd = dd;
+ ret = i2c_transfer(&i2c->adapter, msgs, num_msgs);
+ if (ret != num_msgs) {
+ dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret %d\n",
+ __func__, i2c->num, slave_addr, offset, len, ret);
+ return ret < 0 ? ret : -EIO;
+ }
+ return 0;
+}
+
+static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus,
+ u8 slave_addr, int offset, int offset_size,
+ u8 *data, u16 len)
+{
+ int ret;
+ int num_msgs;
+ u8 offset_bytes[2];
+ struct i2c_msg msgs[2];
+
+ switch (offset_size) {
+ case 0:
+ num_msgs = 1;
+ msgs[0].addr = slave_addr;
+ msgs[0].flags = I2C_M_RD;
+ msgs[0].len = len;
+ msgs[0].buf = data;
+ break;
+ case 2:
+ offset_bytes[1] = (offset >> 8) & 0xff;
+ /* fall through */
+ case 1:
+ num_msgs = 2;
+ offset_bytes[0] = offset & 0xff;
+
+ msgs[0].addr = slave_addr;
+ msgs[0].flags = 0;
+ msgs[0].len = offset_size;
+ msgs[0].buf = offset_bytes;
+
+ msgs[1].addr = slave_addr;
+ msgs[1].flags = I2C_M_RD,
+ msgs[1].len = len;
+ msgs[1].buf = data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ bus->controlling_dd = dd;
+ ret = i2c_transfer(&bus->adapter, msgs, num_msgs);
+ if (ret != num_msgs) {
+ dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n",
+ __func__, bus->num, slave_addr, offset, len, ret);
+ return ret < 0 ? ret : -EIO;
+ }
+ return 0;
+}
+
+/*
+ * Raw i2c write. No set-up or lock checking.
+ *
+ * Return 0 on success, -errno on error.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_i2c_bus *bus;
+ u8 slave_addr;
+ int offset_size;
+
+ bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0;
+ slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */
+ offset_size = (i2c_addr >> 8) & 0x3;
+ return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len);
+}
+
+/*
+ * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes written, or -errno.
+ */
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+ void *bp, int len)
+{
+ int ret;
+
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+ return -EACCES;
+
+ ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+/*
+ * Raw i2c read. No set-up or lock checking.
+ *
+ * Return 0 on success, -errno on error.
+ */
+static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_i2c_bus *bus;
+ u8 slave_addr;
+ int offset_size;
+
+ bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0;
+ slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */
+ offset_size = (i2c_addr >> 8) & 0x3;
+ return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len);
+}
+
+/*
+ * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes read, or -errno.
+ */
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+ void *bp, int len)
+{
+ int ret;
+
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+ return -EACCES;
+
+ ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+/*
+ * Write page n, offset m of QSFP memory as defined by SFF 8636
+ * by writing @addr = ((256 * n) + m)
+ *
+ * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes written or -errno.
+ */
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len)
+{
+ int count = 0;
+ int offset;
+ int nwrite;
+ int ret = 0;
+ u8 page;
+
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+ return -EACCES;
+
+ while (count < len) {
+ /*
+ * Set the qsfp page based on a zero-based address
+ * and a page size of QSFP_PAGESIZE bytes.
+ */
+ page = (u8)(addr / QSFP_PAGESIZE);
+
+ ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+ QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+ /* QSFPs require a 5-10msec delay after write operations */
+ mdelay(5);
+ if (ret) {
+ hfi1_dev_porterr(ppd->dd, ppd->port,
+ "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
+ target, ret);
+ break;
+ }
+
+ offset = addr % QSFP_PAGESIZE;
+ nwrite = len - count;
+ /* truncate write to boundary if crossing boundary */
+ if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY)
+ nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
+
+ ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+ offset, bp + count, nwrite);
+ /* QSFPs require a 5-10msec delay after write operations */
+ mdelay(5);
+ if (ret) /* stop on error */
+ break;
+
+ count += nwrite;
+ addr += nwrite;
+ }
+
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+/*
+ * Perform a stand-alone single QSFP write. Acquire the resource, do the
+ * write, then release the resource.
+ */
+int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 resource = qsfp_resource(dd);
+ int ret;
+
+ ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
+ if (ret)
+ return ret;
+ ret = qsfp_write(ppd, target, addr, bp, len);
+ release_chip_resource(dd, resource);
+
+ return ret;
+}
+
+/*
+ * Access page n, offset m of QSFP memory as defined by SFF 8636
+ * by reading @addr = ((256 * n) + m)
+ *
+ * Caller must hold the i2c chain resource.
+ *
+ * Return the number of bytes read or -errno.
+ */
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len)
+{
+ int count = 0;
+ int offset;
+ int nread;
+ int ret = 0;
+ u8 page;
+
+ if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
+ return -EACCES;
+
+ while (count < len) {
+ /*
+ * Set the qsfp page based on a zero-based address
+ * and a page size of QSFP_PAGESIZE bytes.
+ */
+ page = (u8)(addr / QSFP_PAGESIZE);
+ ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+ QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+ /* QSFPs require a 5-10msec delay after write operations */
+ mdelay(5);
+ if (ret) {
+ hfi1_dev_porterr(ppd->dd, ppd->port,
+ "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
+ target, ret);
+ break;
+ }
+
+ offset = addr % QSFP_PAGESIZE;
+ nread = len - count;
+ /* truncate read to boundary if crossing boundary */
+ if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY)
+ nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
+
+ ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
+ offset, bp + count, nread);
+ if (ret) /* stop on error */
+ break;
+
+ count += nread;
+ addr += nread;
+ }
+
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+/*
+ * Perform a stand-alone single QSFP read. Acquire the resource, do the
+ * read, then release the resource.
+ */
+int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u32 resource = qsfp_resource(dd);
+ int ret;
+
+ ret = acquire_chip_resource(dd, resource, QSFP_WAIT);
+ if (ret)
+ return ret;
+ ret = qsfp_read(ppd, target, addr, bp, len);
+ release_chip_resource(dd, resource);
+
+ return ret;
+}
+
+/*
+ * This function caches the QSFP memory range in 128 byte chunks.
+ * As an example, the next byte after address 255 is byte 128 from
+ * upper page 01H (if existing) rather than byte 0 from lower page 00H.
+ * Access page n, offset m of QSFP memory as defined by SFF 8636
+ * in the cache by reading byte ((128 * n) + m)
+ * The calls to qsfp_{read,write} in this function correctly handle the
+ * address map difference between this mapping and the mapping implemented
+ * by those functions
+ *
+ * The caller must be holding the QSFP i2c chain resource.
+ */
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
+{
+ u32 target = ppd->dd->hfi1_id;
+ int ret;
+ unsigned long flags;
+ u8 *cache = &cp->cache[0];
+
+ /* ensure sane contents on invalid reads, for cable swaps */
+ memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.cache_valid = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+ if (!qsfp_mod_present(ppd)) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
+ if (ret != QSFP_PAGESIZE) {
+ dd_dev_info(ppd->dd,
+ "%s: Page 0 read failed, expected %d, got %d\n",
+ __func__, QSFP_PAGESIZE, ret);
+ goto bail;
+ }
+
+ /* Is paging enabled? */
+ if (!(cache[2] & 4)) {
+ /* Paging enabled, page 03 required */
+ if ((cache[195] & 0xC0) == 0xC0) {
+ /* all */
+ ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ } else if ((cache[195] & 0x80) == 0x80) {
+ /* only page 2 and 3 */
+ ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ } else if ((cache[195] & 0x40) == 0x40) {
+ /* only page 1 and 3 */
+ ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ } else {
+ /* only page 3 */
+ ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+ if (ret <= 0 || ret != 128) {
+ dd_dev_info(ppd->dd, "%s failed\n", __func__);
+ goto bail;
+ }
+ }
+ }
+
+ spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+ ppd->qsfp_info.cache_valid = 1;
+ ppd->qsfp_info.cache_refresh_required = 0;
+ spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+ return 0;
+
+bail:
+ memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
+ return ret;
+}
+
+const char * const hfi1_qsfp_devtech[16] = {
+ "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+ "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+ "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+ "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Takes power class byte [Page 00 Byte 129] in SFF 8636
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+ if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
+ /* power classes count from 1, their bit encodings from 0 */
+ return (QSFP_PWR(power_byte) + 1);
+ /*
+ * 00 in the high power classes stands for unused, bringing
+ * balance to the off-by-1 offset above, we add 4 here to
+ * account for the difference between the low and high power
+ * groups
+ */
+ return (QSFP_HIGH_PWR(power_byte) + 4);
+}
+
+int qsfp_mod_present(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_devdata *dd = ppd->dd;
+ u64 reg;
+
+ reg = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+ return !(reg & QSFP_HFI0_MODPRST_N);
+}
+
+/*
+ * This function maps QSFP memory addresses in 128 byte chunks in the following
+ * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
+ * spec
+ * For addr 000-127, lower page 00h
+ * For addr 128-255, upper page 00h
+ * For addr 256-383, upper page 01h
+ * For addr 384-511, upper page 02h
+ * For addr 512-639, upper page 03h
+ *
+ * For addresses beyond this range, it returns the invalid range of data buffer
+ * set to 0.
+ * For upper pages that are optional, if they are not valid, returns the
+ * particular range of bytes in the data buffer set to 0.
+ */
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
+ u8 *data)
+{
+ struct hfi1_pportdata *ppd;
+ u32 excess_len = len;
+ int ret = 0, offset = 0;
+
+ if (port_num > dd->num_pports || port_num < 1) {
+ dd_dev_info(dd, "%s: Invalid port number %d\n",
+ __func__, port_num);
+ ret = -EINVAL;
+ goto set_zeroes;
+ }
+
+ ppd = dd->pport + (port_num - 1);
+ if (!qsfp_mod_present(ppd)) {
+ ret = -ENODEV;
+ goto set_zeroes;
+ }
+
+ if (!ppd->qsfp_info.cache_valid) {
+ ret = -EINVAL;
+ goto set_zeroes;
+ }
+
+ if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
+ ret = -ERANGE;
+ goto set_zeroes;
+ }
+
+ if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
+ excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
+ memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
+ data += (len - excess_len);
+ goto set_zeroes;
+ }
+
+ memcpy(data, &ppd->qsfp_info.cache[addr], len);
+
+ if (addr <= QSFP_MONITOR_VAL_END &&
+ (addr + len) >= QSFP_MONITOR_VAL_START) {
+ /* Overlap with the dynamic channel monitor range */
+ if (addr < QSFP_MONITOR_VAL_START) {
+ if (addr + len <= QSFP_MONITOR_VAL_END)
+ len = addr + len - QSFP_MONITOR_VAL_START;
+ else
+ len = QSFP_MONITOR_RANGE;
+ offset = QSFP_MONITOR_VAL_START - addr;
+ addr = QSFP_MONITOR_VAL_START;
+ } else if (addr == QSFP_MONITOR_VAL_START) {
+ offset = 0;
+ if (addr + len > QSFP_MONITOR_VAL_END)
+ len = QSFP_MONITOR_RANGE;
+ } else {
+ offset = 0;
+ if (addr + len > QSFP_MONITOR_VAL_END)
+ len = QSFP_MONITOR_VAL_END - addr + 1;
+ }
+ /* Refresh the values of the dynamic monitors from the cable */
+ ret = one_qsfp_read(ppd, dd->hfi1_id, addr, data + offset, len);
+ if (ret != len) {
+ ret = -EAGAIN;
+ goto set_zeroes;
+ }
+ }
+
+ return 0;
+
+set_zeroes:
+ memset(data, 0, excess_len);
+ return ret;
+}
+
+static const char *pwr_codes[8] = {"N/AW",
+ "1.5W",
+ "2.0W",
+ "2.5W",
+ "3.5W",
+ "4.0W",
+ "4.5W",
+ "5.0W"
+ };
+
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
+{
+ u8 *cache = &ppd->qsfp_info.cache[0];
+ u8 bin_buff[QSFP_DUMP_CHUNK];
+ char lenstr[6];
+ int sofar;
+ int bidx = 0;
+ u8 *atten = &cache[QSFP_ATTEN_OFFS];
+ u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+ u8 power_byte = 0;
+
+ sofar = 0;
+ lenstr[0] = ' ';
+ lenstr[1] = '\0';
+
+ if (ppd->qsfp_info.cache_valid) {
+ if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+ snprintf(lenstr, sizeof(lenstr), "%dM ",
+ cache[QSFP_MOD_LEN_OFFS]);
+
+ power_byte = cache[QSFP_MOD_PWR_OFFS];
+ sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
+ pwr_codes[get_qsfp_power_class(power_byte)]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
+ lenstr,
+ hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+ QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+ QSFP_OUI(vendor_oui));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+ QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+ QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
+
+ if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+ sofar += scnprintf(buf + sofar, len - sofar,
+ "Atten:%d, %d\n",
+ QSFP_ATTEN_SDR(atten),
+ QSFP_ATTEN_DDR(atten));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+ QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+ QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+ QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
+
+ while (bidx < QSFP_DEFAULT_HDR_CNT) {
+ int iidx;
+
+ memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
+ for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
+ sofar += scnprintf(buf + sofar, len - sofar,
+ " %02X", bin_buff[iidx]);
+ }
+ sofar += scnprintf(buf + sofar, len - sofar, "\n");
+ bidx += QSFP_DUMP_CHUNK;
+ }
+ }
+ return sofar;
+}
diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h
new file mode 100644
index 000000000000..36cf52359848
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/qsfp.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/* QSFP support common definitions, for hfi driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+/* 128 byte pages, per SFF 8636 rev 2.4 */
+#define QSFP_MAX_NUM_PAGES 5
+
+/*
+ * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1.
+ * _N means asserted low
+ */
+#define QSFP_HFI0_I2CCLK BIT(0)
+#define QSFP_HFI0_I2CDAT BIT(1)
+#define QSFP_HFI0_RESET_N BIT(2)
+#define QSFP_HFI0_INT_N BIT(3)
+#define QSFP_HFI0_MODPRST_N BIT(4)
+
+/* QSFP is paged at 256 bytes */
+#define QSFP_PAGESIZE 256
+/* Reads/writes cannot cross 128 byte boundaries */
+#define QSFP_RW_BOUNDARY 128
+
+/* number of bytes in i2c offset for QSFP devices */
+#define __QSFP_OFFSET_SIZE 1 /* num address bytes */
+#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8) /* shifted value */
+
+/* Defined fields that Intel requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+#define QSFP_MONITOR_VAL_START 22
+#define QSFP_MONITOR_VAL_END 81
+#define QSFP_MONITOR_RANGE (QSFP_MONITOR_VAL_END - QSFP_MONITOR_VAL_START + 1)
+#define QSFP_TX_CTRL_BYTE_OFFS 86
+#define QSFP_PWR_CTRL_BYTE_OFFS 93
+#define QSFP_CDR_CTRL_BYTE_OFFS 98
+
+#define QSFP_PAGE_SELECT_BYTE_OFFS 127
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not Intel req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */
+#define QSFP_NOM_BIT_RATE_100_OFFS 140
+/* Byte 141 is Extended Rate Select. Not Intel req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Intel req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const hfi1_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+ oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR 0x009065
+#define QSFP_OUI_GORE 0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
+ * If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/*
+ * Bytes 188,189 are Wavelength tolerance, if optical
+ * If copper, they are attenuation in dB:
+ * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s
+ */
+#define QSFP_CU_ATTEN_7G_OFFS 188
+#define QSFP_CU_ATTEN_12G_OFFS 189
+/* Byte 190 is Max Case Temp. Not Intel req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
+#define QSFP_CC_OFFS 191
+#define QSFP_EQ_INFO_OFFS 193
+#define QSFP_CDR_INFO_OFFS 194
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
+/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */
+#define QSFP_NOM_BIT_RATE_250_OFFS 222
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * Interrupt flag masks
+ */
+#define QSFP_DATA_NOT_READY 0x01
+
+#define QSFP_HIGH_TEMP_ALARM 0x80
+#define QSFP_LOW_TEMP_ALARM 0x40
+#define QSFP_HIGH_TEMP_WARNING 0x20
+#define QSFP_LOW_TEMP_WARNING 0x10
+
+#define QSFP_HIGH_VCC_ALARM 0x80
+#define QSFP_LOW_VCC_ALARM 0x40
+#define QSFP_HIGH_VCC_WARNING 0x20
+#define QSFP_LOW_VCC_WARNING 0x10
+
+#define QSFP_HIGH_POWER_ALARM 0x88
+#define QSFP_LOW_POWER_ALARM 0x44
+#define QSFP_HIGH_POWER_WARNING 0x22
+#define QSFP_LOW_POWER_WARNING 0x11
+
+#define QSFP_HIGH_BIAS_ALARM 0x88
+#define QSFP_LOW_BIAS_ALARM 0x44
+#define QSFP_HIGH_BIAS_WARNING 0x22
+#define QSFP_LOW_BIAS_WARNING 0x11
+
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+/*
+ * struct qsfp_data encapsulates state of QSFP device for one port.
+ * it will be part of port-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the qsfp_lock arbitrate access to common resources.
+ *
+ */
+struct qsfp_data {
+ /* Helps to find our way */
+ struct hfi1_pportdata *ppd;
+ struct work_struct qsfp_work;
+ u8 cache[QSFP_MAX_NUM_PAGES * 128];
+ /* protect qsfp data */
+ spinlock_t qsfp_lock;
+ u8 check_interrupt_flags;
+ u8 reset_needed;
+ u8 limiting_active;
+ u8 cache_valid;
+ u8 cache_refresh_required;
+};
+
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
+ struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
+int qsfp_mod_present(struct hfi1_pportdata *ppd);
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
+ u32 len, u8 *data);
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len);
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+ int offset, void *bp, int len);
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len);
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len);
+int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len);
+int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+ int len);
+struct hfi1_asic_data;
+int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad);
+void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad);
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
new file mode 100644
index 000000000000..5da190e6011b
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -0,0 +1,2620 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/io.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+#include "trace.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+/**
+ * hfi1_add_retry_timer - add/start a retry timer
+ * @qp - the QP
+ *
+ * add a retry timer on the QP
+ */
+static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ qp->s_flags |= RVT_S_TIMER;
+ /* 4.096 usec. * (1 << qp->timeout) */
+ qp->s_timer.expires = jiffies + qp->timeout_jiffies +
+ rdi->busy_jiffies;
+ add_timer(&qp->s_timer);
+}
+
+/**
+ * hfi1_add_rnr_timer - add/start an rnr timer
+ * @qp - the QP
+ * @to - timeout in usecs
+ *
+ * add an rnr timer on the QP
+ */
+void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ qp->s_flags |= RVT_S_WAIT_RNR;
+ qp->s_timer.expires = jiffies + usecs_to_jiffies(to);
+ add_timer(&priv->s_rnr_timer);
+}
+
+/**
+ * hfi1_mod_retry_timer - mod a retry timer
+ * @qp - the QP
+ *
+ * Modify a potentially already running retry
+ * timer
+ */
+static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ qp->s_flags |= RVT_S_TIMER;
+ /* 4.096 usec. * (1 << qp->timeout) */
+ mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
+ rdi->busy_jiffies);
+}
+
+/**
+ * hfi1_stop_retry_timer - stop a retry timer
+ * @qp - the QP
+ *
+ * stop a retry timer and return if the timer
+ * had been pending.
+ */
+static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
+{
+ int rval = 0;
+
+ /* Remove QP from retry */
+ if (qp->s_flags & RVT_S_TIMER) {
+ qp->s_flags &= ~RVT_S_TIMER;
+ rval = del_timer(&qp->s_timer);
+ }
+ return rval;
+}
+
+/**
+ * hfi1_stop_rc_timers - stop all timers
+ * @qp - the QP
+ *
+ * stop any pending timers
+ */
+void hfi1_stop_rc_timers(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ /* Remove QP from all timers */
+ if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
+ qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ del_timer(&priv->s_rnr_timer);
+ }
+}
+
+/**
+ * hfi1_stop_rnr_timer - stop an rnr timer
+ * @qp - the QP
+ *
+ * stop an rnr timer and return if the timer
+ * had been pending.
+ */
+static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
+{
+ int rval = 0;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ /* Remove QP from rnr timer */
+ if (qp->s_flags & RVT_S_WAIT_RNR) {
+ qp->s_flags &= ~RVT_S_WAIT_RNR;
+ rval = del_timer(&priv->s_rnr_timer);
+ }
+ return rval;
+}
+
+/**
+ * hfi1_del_timers_sync - wait for any timeout routines to exit
+ * @qp - the QP
+ */
+void hfi1_del_timers_sync(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ del_timer_sync(&qp->s_timer);
+ del_timer_sync(&priv->s_rnr_timer);
+}
+
+/* only opcode mask for adaptive pio */
+const u32 rc_only_opcode =
+ BIT(OP(SEND_ONLY) & 0x1f) |
+ BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+ BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+ BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)) |
+ BIT(OP(RDMA_READ_REQUEST & 0x1f)) |
+ BIT(OP(ACKNOWLEDGE & 0x1f)) |
+ BIT(OP(ATOMIC_ACKNOWLEDGE & 0x1f)) |
+ BIT(OP(COMPARE_SWAP & 0x1f)) |
+ BIT(OP(FETCH_ADD & 0x1f));
+
+static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = delta_psn(psn, wqe->psn) * pmtu;
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ss->total_len = wqe->length;
+ hfi1_skip_sge(ss, len, 0);
+ return wqe->length - len;
+}
+
+/**
+ * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @ps: the xmit packet state
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
+ struct hfi1_other_headers *ohdr,
+ struct hfi1_pkt_state *ps)
+{
+ struct rvt_ack_entry *e;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+ int middle = 0;
+ u32 pmtu = qp->pmtu;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ switch (qp->s_ack_state) {
+ case OP(RDMA_READ_RESPONSE_LAST):
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ /* FALLTHROUGH */
+ case OP(ATOMIC_ACKNOWLEDGE):
+ /*
+ * We can increment the tail pointer now that the last
+ * response has been sent instead of only being
+ * constructed.
+ */
+ if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+ qp->s_tail_ack_queue = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_ONLY):
+ case OP(ACKNOWLEDGE):
+ /* Check for no next entry in the queue. */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+ if (qp->s_flags & RVT_S_ACK_PENDING)
+ goto normal;
+ goto bail;
+ }
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST)) {
+ /*
+ * If a RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ ps->s_txreq->mr = e->rdma_sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ if (len > pmtu) {
+ len = pmtu;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+ } else {
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+ e->sent = 1;
+ }
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_rdma_psn = e->psn;
+ bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else {
+ /* COMPARE_SWAP or FETCH_ADD */
+ qp->s_cur_sge = NULL;
+ len = 0;
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+ ohdr->u.at.aeth = hfi1_compute_aeth(qp);
+ ohdr->u.at.atomic_ack_eth[0] =
+ cpu_to_be32(e->atomic_data >> 32);
+ ohdr->u.at.atomic_ack_eth[1] =
+ cpu_to_be32(e->atomic_data);
+ hwords += sizeof(ohdr->u.at) / sizeof(u32);
+ bth2 = mask_psn(e->psn);
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ len = qp->s_ack_rdma_sge.sge.sge_length;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ } else {
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ break;
+
+ default:
+normal:
+ /*
+ * Send a regular ACK.
+ * Set the s_ack_state so we wait until after sending
+ * the ACK before setting s_ack_state to ACKNOWLEDGE
+ * (see above).
+ */
+ qp->s_ack_state = OP(SEND_ONLY);
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
+ qp->s_cur_sge = NULL;
+ if (qp->s_nak_state)
+ ohdr->u.aeth =
+ cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+ (qp->s_nak_state <<
+ HFI1_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ hwords++;
+ len = 0;
+ bth0 = OP(ACKNOWLEDGE) << 24;
+ bth2 = mask_psn(qp->s_ack_psn);
+ }
+ qp->s_rdma_ack_cnt++;
+ qp->s_hdrwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ qp->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+ /* pbc */
+ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+ return 1;
+
+bail:
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ /*
+ * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+ * RVT_S_RESP_PENDING
+ */
+ smp_wmb();
+ qp->s_flags &= ~(RVT_S_RESP_PENDING
+ | RVT_S_ACK_PENDING
+ | RVT_S_AHG_VALID);
+ return 0;
+}
+
+/**
+ * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Assumes s_lock is held.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_other_headers *ohdr;
+ struct rvt_sge_state *ss;
+ struct rvt_swqe *wqe;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len;
+ u32 bth0 = 0;
+ u32 bth2;
+ u32 pmtu = qp->pmtu;
+ char newreq;
+ int middle = 0;
+ int delta;
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (IS_ERR(ps->s_txreq))
+ goto bail_no_tx;
+
+ ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+
+ /* Sending responses has higher priority over sending requests. */
+ if ((qp->s_flags & RVT_S_RESP_PENDING) &&
+ make_rc_ack(dev, qp, ohdr, ps))
+ return 1;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+ if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ smp_read_barrier_depends(); /* see post_one_send() */
+ if (qp->s_last == ACCESS_ONCE(qp->s_head))
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ /* will get called again */
+ goto done_free_tx;
+ }
+
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+ goto bail;
+
+ if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+ if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+ qp->s_flags |= RVT_S_WAIT_PSN;
+ goto bail;
+ }
+ qp->s_sending_psn = qp->s_psn;
+ qp->s_sending_hpsn = qp->s_psn - 1;
+ }
+
+ /* Send a request. */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+ switch (qp->s_state) {
+ default:
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /*
+ * Resend an old request or start a new one.
+ *
+ * We keep track of the current SWQE so that
+ * we don't reset the "furthest progress" state
+ * if we need to back up.
+ */
+ newreq = 0;
+ if (qp->s_cur == qp->s_tail) {
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head) {
+ clear_ahg(qp);
+ goto bail;
+ }
+ /*
+ * If a fence is requested, wait for previous
+ * RDMA read and atomic operations to finish.
+ */
+ if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+ qp->s_num_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_FENCE;
+ goto bail;
+ }
+ /*
+ * Local operations are processed immediately
+ * after all prior requests have completed
+ */
+ if (wqe->wr.opcode == IB_WR_REG_MR ||
+ wqe->wr.opcode == IB_WR_LOCAL_INV) {
+ int local_ops = 0;
+ int err = 0;
+
+ if (qp->s_last != qp->s_cur)
+ goto bail;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ if (++qp->s_tail == qp->s_size)
+ qp->s_tail = 0;
+ if (!(wqe->wr.send_flags &
+ RVT_SEND_COMPLETION_ONLY)) {
+ err = rvt_invalidate_rkey(
+ qp,
+ wqe->wr.ex.invalidate_rkey);
+ local_ops = 1;
+ }
+ hfi1_send_complete(qp, wqe,
+ err ? IB_WC_LOC_PROT_ERR
+ : IB_WC_SUCCESS);
+ if (local_ops)
+ atomic_dec(&qp->local_ops_pending);
+ qp->s_hdrwords = 0;
+ goto done_free_tx;
+ }
+
+ newreq = 1;
+ qp->s_psn = wqe->psn;
+ }
+ /*
+ * Note that we have to be careful not to modify the
+ * original work request since we may need to resend
+ * it.
+ */
+ len = wqe->length;
+ ss = &qp->s_sge;
+ bth2 = mask_psn(qp->s_psn);
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND_WITH_INV:
+ /* If no credit, return. */
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+ cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ if (len > pmtu) {
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND) {
+ qp->s_state = OP(SEND_ONLY);
+ } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ } else {
+ qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
+ /* Invalidate rkey comes after the BTH */
+ ohdr->u.ieth = cpu_to_be32(
+ wqe->wr.ex.invalidate_rkey);
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ /* If no credit, return. */
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+ cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / sizeof(u32);
+ if (len > pmtu) {
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ } else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_READ:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_state = OP(COMPARE_SWAP);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->atomic_wr.swap);
+ ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+ wqe->atomic_wr.compare_add);
+ } else {
+ qp->s_state = OP(FETCH_ADD);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->atomic_wr.compare_add);
+ ohdr->u.atomic_eth.compare_data = 0;
+ }
+ ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+ wqe->atomic_wr.remote_addr >> 32);
+ ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+ wqe->atomic_wr.remote_addr);
+ ohdr->u.atomic_eth.rkey = cpu_to_be32(
+ wqe->atomic_wr.rkey);
+ hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ if (newreq) {
+ qp->s_tail++;
+ if (qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_psn = wqe->lpsn + 1;
+ else
+ qp->s_psn++;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+ * thread to indicate a SEND needs to be restarted from an
+ * earlier PSN without interfering with the sending thread.
+ * See restart_rc().
+ */
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+ /* FALLTHROUGH */
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ bth2 = mask_psn(qp->s_psn++);
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND) {
+ qp->s_state = OP(SEND_LAST);
+ } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ } else {
+ qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
+ /* invalidate data comes after the BTH */
+ ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth2 |= IB_BTH_REQ_ACK;
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+ * thread to indicate a RDMA write needs to be restarted from
+ * an earlier PSN without interfering with the sending thread.
+ * See restart_rc().
+ */
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ bth2 = mask_psn(qp->s_psn++);
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ } else {
+ qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ bth2 |= IB_BTH_REQ_ACK;
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+ * thread to indicate a RDMA read needs to be restarted from
+ * an earlier PSN without interfering with the sending thread.
+ * See restart_rc().
+ */
+ len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr + len);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ len = 0;
+ qp->s_cur++;
+ if (qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_sending_hpsn = bth2;
+ delta = delta_psn(bth2, wqe->psn);
+ if (delta && delta % HFI1_PSN_CREDIT == 0)
+ bth2 |= IB_BTH_REQ_ACK;
+ if (qp->s_flags & RVT_S_SEND_ONE) {
+ qp->s_flags &= ~RVT_S_SEND_ONE;
+ qp->s_flags |= RVT_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ qp->s_cur_sge = ss;
+ qp->s_cur_size = len;
+ hfi1_make_ruc_header(
+ qp,
+ ohdr,
+ bth0 | (qp->s_state << 24),
+ bth2,
+ middle,
+ ps);
+ /* pbc */
+ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+ return 1;
+
+done_free_tx:
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ return 1;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+ ps->s_txreq = NULL;
+ qp->s_flags &= ~RVT_S_BUSY;
+ qp->s_hdrwords = 0;
+ return 0;
+}
+
+/**
+ * hfi1_send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
+ int is_fecn)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 pbc, pbc_flags = 0;
+ u16 lrh0;
+ u16 sc5;
+ u32 bth0;
+ u32 hwords;
+ u32 vl, plen;
+ struct send_context *sc;
+ struct pio_buf *pbuf;
+ struct hfi1_ib_header hdr;
+ struct hfi1_other_headers *ohdr;
+ unsigned long flags;
+
+ /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+ if (qp->s_flags & RVT_S_RESP_PENDING)
+ goto queue_ack;
+
+ /* Ensure s_rdma_ack_cnt changes are committed */
+ smp_read_barrier_depends();
+ if (qp->s_rdma_ack_cnt)
+ goto queue_ack;
+
+ /* Construct the header */
+ /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
+ hwords = 6;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
+ &qp->remote_ah_attr.grh, hwords, 0);
+ ohdr = &hdr.u.l.oth;
+ lrh0 = HFI1_LRH_GRH;
+ } else {
+ ohdr = &hdr.u.oth;
+ lrh0 = HFI1_LRH_BTH;
+ }
+ /* read pkey_index w/o lock (its atomic) */
+ bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+ if (qp->s_mig_state == IB_MIG_MIGRATED)
+ bth0 |= IB_BTH_MIG_REQ;
+ if (qp->r_nak_state)
+ ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+ (qp->r_nak_state <<
+ HFI1_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = hfi1_compute_aeth(qp);
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+ pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
+ lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
+ ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
+
+ /* Don't try to send ACKs if the link isn't ACTIVE */
+ if (driver_lstate(ppd) != IB_PORT_ACTIVE)
+ return;
+
+ sc = rcd->sc;
+ plen = 2 /* PBC */ + hwords;
+ vl = sc_to_vlt(ppd->dd, sc5);
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+
+ pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+ if (!pbuf) {
+ /*
+ * We have no room to send at the moment. Pass
+ * responsibility for sending the ACK to the send tasklet
+ * so that when enough buffer space becomes available,
+ * the ACK is sent ahead of other outgoing packets.
+ */
+ goto queue_ack;
+ }
+
+ trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+
+ /* write the pbc and data */
+ ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
+
+ return;
+
+queue_ack:
+ this_cpu_inc(*ibp->rvp.rc_qacks);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ if (is_fecn)
+ qp->s_flags |= RVT_S_ECN;
+
+ /* Schedule the send tasklet. */
+ hfi1_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct rvt_qp *qp, u32 psn)
+{
+ u32 n = qp->s_acked;
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
+ u32 opcode;
+
+ qp->s_cur = n;
+
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (cmp_psn(psn, wqe->psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+
+ /* Find the work request opcode corresponding to the given PSN. */
+ opcode = wqe->wr.opcode;
+ for (;;) {
+ int diff;
+
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ wqe = rvt_get_swqe_ptr(qp, n);
+ diff = cmp_psn(psn, wqe->psn);
+ if (diff < 0)
+ break;
+ qp->s_cur = n;
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (diff == 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+ opcode = wqe->wr.opcode;
+ }
+
+ /*
+ * Set the state to restart in the middle of a request.
+ * Don't change the s_sge, s_cur_sge, or s_cur_size.
+ * See hfi1_make_rc_req().
+ */
+ switch (opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case IB_WR_RDMA_READ:
+ qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ break;
+
+ default:
+ /*
+ * This case shouldn't happen since its only
+ * one PSN per req.
+ */
+ qp->s_state = OP(SEND_LAST);
+ }
+done:
+ qp->s_psn = psn;
+ /*
+ * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
+ * asynchronously before the send tasklet can get scheduled.
+ * Doing it in hfi1_make_rc_req() is too late.
+ */
+ if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+ (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+ qp->s_flags |= RVT_S_WAIT_PSN;
+ qp->s_flags &= ~RVT_S_AHG_VALID;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
+{
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ struct hfi1_ibport *ibp;
+
+ if (qp->s_retry == 0) {
+ if (qp->s_mig_state == IB_MIG_ARMED) {
+ hfi1_migrate_qp(qp);
+ qp->s_retry = qp->s_retry_cnt;
+ } else if (qp->s_last == qp->s_acked) {
+ hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ return;
+ } else { /* need to handle delayed completion */
+ return;
+ }
+ } else {
+ qp->s_retry--;
+ }
+
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ ibp->rvp.n_rc_resends++;
+ else
+ ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
+
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
+ RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
+ RVT_S_WAIT_ACK);
+ if (wait)
+ qp->s_flags |= RVT_S_SEND_ONE;
+ reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+void hfi1_rc_timeout(unsigned long arg)
+{
+ struct rvt_qp *qp = (struct rvt_qp *)arg;
+ struct hfi1_ibport *ibp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qp->s_flags & RVT_S_TIMER) {
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ibp->rvp.n_rc_timeouts++;
+ qp->s_flags &= ~RVT_S_TIMER;
+ del_timer(&qp->s_timer);
+ trace_hfi1_timeout(qp, qp->s_last_psn + 1);
+ restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void hfi1_rc_rnr_retry(unsigned long arg)
+{
+ struct rvt_qp *qp = (struct rvt_qp *)arg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_stop_rnr_timer(qp);
+ hfi1_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
+{
+ struct rvt_swqe *wqe;
+ u32 n = qp->s_last;
+
+ /* Find the work request corresponding to the given PSN. */
+ for (;;) {
+ wqe = rvt_get_swqe_ptr(qp, n);
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_sending_psn = wqe->lpsn + 1;
+ else
+ qp->s_sending_psn = psn + 1;
+ break;
+ }
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ }
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
+{
+ struct hfi1_other_headers *ohdr;
+ struct rvt_swqe *wqe;
+ struct ib_wc wc;
+ unsigned i;
+ u32 opcode;
+ u32 psn;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ /* Find out where the BTH is */
+ if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ WARN_ON(!qp->s_rdma_ack_cnt);
+ qp->s_rdma_ack_cnt--;
+ return;
+ }
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ reset_sending_psn(qp, psn);
+
+ /*
+ * Start timer after a packet requesting an ACK has been sent and
+ * there are still requests that haven't been acked.
+ */
+ if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ !(qp->s_flags &
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+ hfi1_add_retry_timer(qp);
+
+ while (qp->s_last != qp->s_acked) {
+ u32 s_last;
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+ cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+ break;
+ s_last = qp->s_last;
+ if (++s_last >= qp->s_size)
+ s_last = 0;
+ qp->s_last = s_last;
+ /* see post_send() */
+ barrier();
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct rvt_sge *sge = &wqe->sg_list[i];
+
+ rvt_put_mr(sge->mr);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
+ }
+ }
+ /*
+ * If we were waiting for sends to complete before re-sending,
+ * and they are now complete, restart sending.
+ */
+ trace_hfi1_sendcomplete(qp, psn);
+ if (qp->s_flags & RVT_S_WAIT_PSN &&
+ cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+ qp->s_flags &= ~RVT_S_WAIT_PSN;
+ qp->s_sending_psn = qp->s_psn;
+ qp->s_sending_hpsn = qp->s_psn - 1;
+ hfi1_schedule_send(qp);
+ }
+}
+
+static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
+{
+ qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to hfi1_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp)
+{
+ struct ib_wc wc;
+ unsigned i;
+
+ /*
+ * Don't decrement refcount and don't generate a
+ * completion if the SWQE is being resent until the send
+ * is finished.
+ */
+ if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
+ cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+ u32 s_last;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct rvt_sge *sge = &wqe->sg_list[i];
+
+ rvt_put_mr(sge->mr);
+ }
+ s_last = qp->s_last;
+ if (++s_last >= qp->s_size)
+ s_last = 0;
+ qp->s_last = s_last;
+ /* see post_send() */
+ barrier();
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
+ }
+ } else {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ this_cpu_inc(*ibp->rvp.rc_delayed_comp);
+ /*
+ * If send progress not running attempt to progress
+ * SDMA queue.
+ */
+ if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
+ struct sdma_engine *engine;
+ u8 sc5;
+
+ /* For now use sc to find engine */
+ sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+ engine = qp_to_sdma_engine(qp, sc5);
+ sdma_engine_progress_schedule(engine);
+ }
+ }
+
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, wqe->lpsn);
+
+ /*
+ * If we are completing a request which is in the process of
+ * being resent, we can stop re-sending it since we know the
+ * responder has already seen it.
+ */
+ if (qp->s_acked == qp->s_cur) {
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_acked = qp->s_cur;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+ if (qp->s_acked != qp->s_tail) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = wqe->psn;
+ }
+ } else {
+ if (++qp->s_acked >= qp->s_size)
+ qp->s_acked = 0;
+ if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+ qp->s_draining = 0;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ }
+ return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * May be called at interrupt level, with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_ibport *ibp;
+ enum ib_wc_status status;
+ struct rvt_swqe *wqe;
+ int ret = 0;
+ u32 ack_psn;
+ int diff;
+ unsigned long to;
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request. The MSN won't include the NAK'ed
+ * request but will include an ACK'ed request(s).
+ */
+ ack_psn = psn;
+ if (aeth >> 29)
+ ack_psn--;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /*
+ * The MSN might be for a later WQE than the PSN indicates so
+ * only complete WQEs that the PSN finishes.
+ */
+ while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
+ /*
+ * RDMA_READ_RESPONSE_ONLY is a special case since
+ * we want to generate completion events for everything
+ * before the RDMA read, copy the data, then generate
+ * the completion for the read.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+ opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+ diff == 0) {
+ ret = 1;
+ goto bail_stop;
+ }
+ /*
+ * If this request is a RDMA read or atomic, and the ACK is
+ * for a later operation, this ACK NAKs the RDMA read or
+ * atomic. In other words, only a RDMA_READ_LAST or ONLY
+ * can ACK a RDMA read and likewise for atomic ops. Note
+ * that the NAK case can only happen if relaxed ordering is
+ * used and requests are sent after an RDMA read or atomic
+ * is sent but before the response is received.
+ */
+ if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ /*
+ * No need to process the ACK/NAK since we are
+ * restarting an earlier request.
+ */
+ goto bail_stop;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ u64 *vaddr = wqe->sg_list[0].vaddr;
+ *vaddr = val;
+ }
+ if (qp->s_num_rd_atomic &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+ qp->s_num_rd_atomic--;
+ /* Restart sending task if fence is complete */
+ if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ }
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ }
+
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ this_cpu_inc(*ibp->rvp.rc_acks);
+ if (qp->s_acked != qp->s_tail) {
+ /*
+ * We are expecting more ACKs so
+ * mod the retry timer.
+ */
+ hfi1_mod_retry_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets and
+ * continue with the next packet the receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ } else {
+ /* No more acks - kill all timers */
+ hfi1_stop_rc_timers(qp);
+ if (cmp_psn(qp->s_psn, psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = psn + 1;
+ }
+ }
+ if (qp->s_flags & RVT_S_WAIT_ACK) {
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
+ hfi1_schedule_send(qp);
+ }
+ hfi1_get_credit(qp, aeth);
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, psn);
+ return 1;
+
+ case 1: /* RNR NAK */
+ ibp->rvp.n_rnr_naks++;
+ if (qp->s_acked == qp->s_tail)
+ goto bail_stop;
+ if (qp->s_flags & RVT_S_WAIT_RNR)
+ goto bail_stop;
+ if (qp->s_rnr_retry == 0) {
+ status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
+ if (qp->s_rnr_retry_cnt < 7)
+ qp->s_rnr_retry--;
+
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+
+ ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
+
+ reset_psn(qp, psn);
+
+ qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
+ hfi1_stop_rc_timers(qp);
+ to =
+ ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
+ HFI1_AETH_CREDIT_MASK];
+ hfi1_add_rnr_timer(qp, to);
+ return 0;
+
+ case 3: /* NAK */
+ if (qp->s_acked == qp->s_tail)
+ goto bail_stop;
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+ switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
+ HFI1_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ ibp->rvp.n_seq_naks++;
+ /*
+ * Back up to the responder's expected PSN.
+ * Note that we might get a NAK in the middle of an
+ * RDMA READ response which terminates the RDMA
+ * READ.
+ */
+ restart_rc(qp, psn, 0);
+ hfi1_schedule_send(qp);
+ break;
+
+ case 1: /* Invalid Request */
+ status = IB_WC_REM_INV_REQ_ERR;
+ ibp->rvp.n_other_naks++;
+ goto class_b;
+
+ case 2: /* Remote Access Error */
+ status = IB_WC_REM_ACCESS_ERR;
+ ibp->rvp.n_other_naks++;
+ goto class_b;
+
+ case 3: /* Remote Operation Error */
+ status = IB_WC_REM_OP_ERR;
+ ibp->rvp.n_other_naks++;
+class_b:
+ if (qp->s_last == qp->s_acked) {
+ hfi1_send_complete(qp, wqe, status);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
+ break;
+
+ default:
+ /* Ignore other reserved NAK error codes */
+ goto reserved;
+ }
+ qp->s_retry = qp->s_retry_cnt;
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ goto bail_stop;
+
+ default: /* 2: reserved */
+reserved:
+ /* Ignore reserved NAK codes. */
+ goto bail_stop;
+ }
+ /* cannot be reached */
+bail_stop:
+ hfi1_stop_rc_timers(qp);
+ return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
+ struct hfi1_ctxtdata *rcd)
+{
+ struct rvt_swqe *wqe;
+
+ /* Remove QP from retry timer */
+ hfi1_stop_rc_timers(qp);
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+ while (cmp_psn(psn, wqe->lpsn) > 0) {
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+ break;
+ wqe = do_rc_completion(qp, wqe, ibp);
+ }
+
+ ibp->rvp.n_rdma_seq++;
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/**
+ * rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void rc_rcv_resp(struct hfi1_ibport *ibp,
+ struct hfi1_other_headers *ohdr,
+ void *data, u32 tlen, struct rvt_qp *qp,
+ u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
+ struct hfi1_ctxtdata *rcd)
+{
+ struct rvt_swqe *wqe;
+ enum ib_wc_status status;
+ unsigned long flags;
+ int diff;
+ u32 pad;
+ u32 aeth;
+ u64 val;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ trace_hfi1_ack(qp, psn);
+
+ /* Ignore invalid responses. */
+ smp_read_barrier_depends(); /* see post_one_send */
+ if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ diff = cmp_psn(psn, qp->s_last_psn);
+ if (unlikely(diff <= 0)) {
+ /* Update credits for "ghost" ACKs */
+ if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if ((aeth >> 29) == 0)
+ hfi1_get_credit(qp, aeth);
+ }
+ goto ack_done;
+ }
+
+ /*
+ * Skip everything other than the PSN we expect, if we are waiting
+ * for a reply to a restarted RDMA read or atomic op.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+ if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+ }
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ status = IB_WC_SUCCESS;
+
+ switch (opcode) {
+ case OP(ACKNOWLEDGE):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+ __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+ val = ((u64)be32_to_cpu(p[0]) << 32) |
+ be32_to_cpu(p[1]);
+ } else {
+ val = 0;
+ }
+ if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+ opcode != OP(RDMA_READ_RESPONSE_FIRST))
+ goto ack_done;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_middle;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /* no AETH, no ACK */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+ goto ack_seq_err;
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+read_middle:
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto ack_len_err;
+ if (unlikely(pmtu >= qp->s_rdma_read_len))
+ goto ack_len_err;
+
+ /*
+ * We got a response so update the timeout.
+ * 4.096 usec. * (1 << qp->timeout)
+ */
+ qp->s_flags |= RVT_S_TIMER;
+ mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+ if (qp->s_flags & RVT_S_WAIT_ACK) {
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
+ hfi1_schedule_send(qp);
+ }
+
+ if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+ qp->s_retry = qp->s_retry_cnt;
+
+ /*
+ * Update the RDMA receive state but do the copy w/o
+ * holding the locks and blocking interrupts.
+ */
+ qp->s_rdma_read_len -= pmtu;
+ update_last_psn(qp, psn);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
+ goto bail;
+
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 0 && <= pmtu.
+ * Remember to account for ICRC (4).
+ */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto ack_len_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_last;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /* ACKs READ req. */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+ goto ack_seq_err;
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 1 && <= pmtu.
+ * Remember to account for ICRC (4).
+ */
+ if (unlikely(tlen <= (hdrsize + pad + 4)))
+ goto ack_len_err;
+read_last:
+ tlen -= hdrsize + pad + 4;
+ if (unlikely(tlen != qp->s_rdma_read_len))
+ goto ack_len_err;
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
+ WARN_ON(qp->s_rdma_read_sge.num_sge);
+ (void)do_rc_ack(qp, aeth, psn,
+ OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+ goto ack_done;
+ }
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ goto ack_err;
+
+ack_seq_err:
+ rdma_seq_err(qp, ibp, psn, rcd);
+ goto ack_done;
+
+ack_len_err:
+ status = IB_WC_LOC_LEN_ERR;
+ack_err:
+ if (qp->s_last == qp->s_acked) {
+ hfi1_send_complete(qp, wqe, status);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+ return;
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp)
+{
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+static inline void rc_cancel_ack(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ priv->r_adefered = 0;
+ if (list_empty(&qp->rspwait))
+ return;
+ list_del_init(&qp->rspwait);
+ qp->r_flags &= ~RVT_R_RSP_NAK;
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+/**
+ * rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ *
+ * This is called from hfi1_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
+ struct rvt_qp *qp, u32 opcode, u32 psn,
+ int diff, struct hfi1_ctxtdata *rcd)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ u8 i, prev;
+ int old_req;
+
+ trace_hfi1_rcv_error(qp, psn);
+ if (diff > 0) {
+ /*
+ * Packet sequence error.
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if we already sent one.
+ */
+ if (!qp->r_nak_state) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Wait to send the sequence NAK until all packets
+ * in the receive queue have been processed.
+ * Otherwise, we end up propagating congestion.
+ */
+ rc_defered_ack(rcd, qp);
+ }
+ goto done;
+ }
+
+ /*
+ * Handle a duplicate request. Don't re-execute SEND, RDMA
+ * write or atomic op. Don't NAK errors, just silently drop
+ * the duplicate request. Note that r_sge, r_len, and
+ * r_rcv_len may be in use so don't modify them.
+ *
+ * We are supposed to ACK the earliest duplicate PSN but we
+ * can coalesce an outstanding duplicate ACK. We have to
+ * send the earliest so that RDMA reads can be restarted at
+ * the requester's expected PSN.
+ *
+ * First, find where this duplicate PSN falls within the
+ * ACKs previously sent.
+ * old_req is true if there is an older response that is scheduled
+ * to be sent before sending this one.
+ */
+ e = NULL;
+ old_req = 1;
+ ibp->rvp.n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ for (i = qp->r_head_ack_queue; ; i = prev) {
+ if (i == qp->s_tail_ack_queue)
+ old_req = 0;
+ if (i)
+ prev = i - 1;
+ else
+ prev = HFI1_MAX_RDMA_ATOMIC;
+ if (prev == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[prev];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (prev == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ old_req = 0;
+ break;
+ }
+ }
+ switch (opcode) {
+ case OP(RDMA_READ_REQUEST): {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+
+ /*
+ * If we didn't find the RDMA read request in the ack queue,
+ * we can ignore this request.
+ */
+ if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+ goto unlock_done;
+ /* RETH comes after BTH */
+ reth = &ohdr->u.rc.reth;
+ /*
+ * Address range must be a subset of the original
+ * request and start on pmtu boundaries.
+ * We reuse the old ack_queue slot since the requester
+ * should not back up and request an earlier PSN for the
+ * same request.
+ */
+ offset = delta_psn(psn, e->psn) * qp->pmtu;
+ len = be32_to_cpu(reth->length);
+ if (unlikely(offset + len != e->rdma_sge.sge_length))
+ goto unlock_done;
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ if (len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock_done;
+ } else {
+ e->rdma_sge.vaddr = NULL;
+ e->rdma_sge.length = 0;
+ e->rdma_sge.sge_length = 0;
+ }
+ e->psn = psn;
+ if (old_req)
+ goto unlock_done;
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ /*
+ * If we didn't find the atomic request in the ack queue
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != (u8)opcode || old_req)
+ goto unlock_done;
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ default:
+ /*
+ * Ignore this operation if it doesn't request an ACK
+ * or an earlier RDMA read or atomic is going to be resent.
+ */
+ if (!(psn & IB_BTH_REQ_ACK) || old_req)
+ goto unlock_done;
+ /*
+ * Resend the most recent ACK if this request is
+ * after all the previous RDMA reads and atomics.
+ */
+ if (i == qp->r_head_ack_queue) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->r_psn - 1;
+ goto send_ack;
+ }
+
+ /*
+ * Resend the RDMA read or atomic op which
+ * ACKs this duplicate request.
+ */
+ qp->s_tail_ack_queue = i;
+ break;
+ }
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ qp->r_nak_state = 0;
+ hfi1_schedule_send(qp);
+
+unlock_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+
+send_ack:
+ return 0;
+}
+
+void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
+{
+ unsigned long flags;
+ int lastwqe;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ lastwqe = rvt_error_qp(qp, err);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
+{
+ unsigned next;
+
+ next = n + 1;
+ if (next > HFI1_MAX_RDMA_ATOMIC)
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
+ u32 lqpn, u32 rqpn, u8 svc_type)
+{
+ struct opa_hfi1_cong_log_event_internal *cc_event;
+ unsigned long flags;
+
+ if (sl >= OPA_MAX_SLS)
+ return;
+
+ spin_lock_irqsave(&ppd->cc_log_lock, flags);
+
+ ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
+ ppd->threshold_event_counter++;
+
+ cc_event = &ppd->cc_events[ppd->cc_log_idx++];
+ if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
+ ppd->cc_log_idx = 0;
+ cc_event->lqpn = lqpn & RVT_QPN_MASK;
+ cc_event->rqpn = rqpn & RVT_QPN_MASK;
+ cc_event->sl = sl;
+ cc_event->svc_type = svc_type;
+ cc_event->rlid = rlid;
+ /* keep timestamp in units of 1.024 usec */
+ cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
+
+ spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
+}
+
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+ u32 rqpn, u8 svc_type)
+{
+ struct cca_timer *cca_timer;
+ u16 ccti, ccti_incr, ccti_timer, ccti_limit;
+ u8 trigger_threshold;
+ struct cc_state *cc_state;
+ unsigned long flags;
+
+ if (sl >= OPA_MAX_SLS)
+ return;
+
+ cc_state = get_cc_state(ppd);
+
+ if (!cc_state)
+ return;
+
+ /*
+ * 1) increase CCTI (for this SL)
+ * 2) select IPG (i.e., call set_link_ipg())
+ * 3) start timer
+ */
+ ccti_limit = cc_state->cct.ccti_limit;
+ ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
+ ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+ trigger_threshold =
+ cc_state->cong_setting.entries[sl].trigger_threshold;
+
+ spin_lock_irqsave(&ppd->cca_timer_lock, flags);
+
+ cca_timer = &ppd->cca_timer[sl];
+ if (cca_timer->ccti < ccti_limit) {
+ if (cca_timer->ccti + ccti_incr <= ccti_limit)
+ cca_timer->ccti += ccti_incr;
+ else
+ cca_timer->ccti = ccti_limit;
+ set_link_ipg(ppd);
+ }
+
+ ccti = cca_timer->ccti;
+
+ if (!hrtimer_active(&cca_timer->hrtimer)) {
+ /* ccti_timer is in units of 1.024 usec */
+ unsigned long nsec = 1024 * ccti_timer;
+
+ hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
+ HRTIMER_MODE_REL);
+ }
+
+ spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
+ if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
+ log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
+
+/**
+ * hfi1_rc_rcv - process an incoming RC packet
+ * @rcd: the context pointer
+ * @hdr: the header of this packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * May be called at interrupt level.
+ */
+void hfi1_rc_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 rcv_flags = packet->rcv_flags;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_other_headers *ohdr = packet->ohdr;
+ u32 bth0, opcode;
+ u32 hdrsize = packet->hlen;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = qp->pmtu;
+ int diff;
+ struct ib_reth *reth;
+ unsigned long flags;
+ int ret, is_fecn = 0;
+ int copy_last = 0;
+ u32 rkey;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+ return;
+
+ is_fecn = process_ecn(qp, packet, false);
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode = (bth0 >> 24) & 0xff;
+
+ /*
+ * Process responses (ACKs) before anything else. Note that the
+ * packet sequence number will be for something in the send work
+ * queue rather than the expected receive packet sequence number.
+ * In other words, this QP is the requester.
+ */
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+ hdrsize, pmtu, rcd);
+ if (is_fecn)
+ goto send_ack;
+ return;
+ }
+
+ /* Compute 24 bits worth of difference. */
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+ return;
+ goto send_ack;
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(SEND_LAST_WITH_INVALIDATE))
+ break;
+ goto nack_inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ default:
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
+ opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ goto nack_inv;
+ /*
+ * Note that it is up to the requester to not send a new
+ * RDMA read or atomic operation before receiving an ACK
+ * for the previous operation.
+ */
+ break;
+ }
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ qp_comm_est(qp);
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ ret = hfi1_rvt_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto nack_inv;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto nack_inv;
+ hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ /* consume RWQE */
+ ret = hfi1_rvt_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ case OP(SEND_ONLY_WITH_INVALIDATE):
+ ret = hfi1_rvt_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto no_immediate_data;
+ if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
+ goto send_last_inv;
+ /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+ wc.ex.imm_data = ohdr->u.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+ case OP(SEND_LAST_WITH_INVALIDATE):
+send_last_inv:
+ rkey = be32_to_cpu(ohdr->u.ieth);
+ if (rvt_invalidate_rkey(qp, rkey))
+ goto no_immediate_data;
+ wc.ex.invalidate_rkey = rkey;
+ wc.wc_flags = IB_WC_WITH_INVALIDATE;
+ goto send_last;
+ case OP(RDMA_WRITE_LAST):
+ copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
+ /* fall through */
+ case OP(SEND_LAST):
+no_immediate_data:
+ wc.wc_flags = 0;
+ wc.ex.imm_data = 0;
+send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (bth0 >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto nack_inv;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto nack_inv;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
+ rvt_put_ss(&qp->r_sge);
+ qp->r_msn++;
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+ break;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ /*
+ * It seems that IB mandates the presence of an SL in a
+ * work completion only for the UD transport (see section
+ * 11.4.2 of IBTA Vol. 1).
+ *
+ * However, the way the SL is chosen below is consistent
+ * with the way that IB/qib works and is trying avoid
+ * introducing incompatibilities.
+ *
+ * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+ */
+ wc.sl = qp->remote_ah_attr.sl;
+ /* zero fields that are N/A */
+ wc.vendor_err = 0;
+ wc.pkey_index = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+ (bth0 & IB_BTH_SOLICITED) != 0);
+ break;
+
+ case OP(RDMA_WRITE_ONLY):
+ copy_last = 1;
+ /* fall through */
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+ /* consume RWQE */
+ reth = &ohdr->u.rc.reth;
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ qp->r_sge.sg_list = NULL;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto nack_acc;
+ qp->r_sge.num_sge = 1;
+ } else {
+ qp->r_sge.num_sge = 0;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_FIRST))
+ goto send_middle;
+ else if (opcode == OP(RDMA_WRITE_ONLY))
+ goto no_immediate_data;
+ ret = hfi1_rvt_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ wc.ex.imm_data = ohdr->u.rc.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+
+ case OP(RDMA_READ_REQUEST): {
+ struct rvt_ack_entry *e;
+ u32 len;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
+ if (next > HFI1_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ reth = &ohdr->u.rc.reth;
+ len = be32_to_cpu(reth->length);
+ if (len) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto nack_acc_unlck;
+ /*
+ * Update the next expected PSN. We add 1 later
+ * below, so only add the remainder here.
+ */
+ if (len > pmtu)
+ qp->r_psn += (len - 1) / pmtu;
+ } else {
+ e->rdma_sge.mr = NULL;
+ e->rdma_sge.vaddr = NULL;
+ e->rdma_sge.length = 0;
+ e->rdma_sge.sge_length = 0;
+ }
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ struct ib_atomic_eth *ateth;
+ struct rvt_ack_entry *e;
+ u64 vaddr;
+ atomic64_t *maddr;
+ u64 sdata;
+ u32 rkey;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > HFI1_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ ateth = &ohdr->u.atomic_eth;
+ vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) |
+ be32_to_cpu(ateth->vaddr[1]);
+ if (unlikely(vaddr & (sizeof(u64) - 1)))
+ goto nack_inv_unlck;
+ rkey = be32_to_cpu(ateth->rkey);
+ /* Check rkey & NAK */
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_acc_unlck;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+ sdata = be64_to_cpu(ateth->swap_data);
+ e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+ (u64)atomic64_add_return(sdata, maddr) - sdata :
+ (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+ be64_to_cpu(ateth->compare_data),
+ sdata);
+ rvt_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ e->lpsn = psn;
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+ }
+
+ default:
+ /* NAK unknown opcodes. */
+ goto nack_inv;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_ack_psn = psn;
+ qp->r_nak_state = 0;
+ /* Send an ACK if requested or required. */
+ if (psn & IB_BTH_REQ_ACK) {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (packet->numpkt == 0) {
+ rc_cancel_ack(qp);
+ goto send_ack;
+ }
+ if (priv->r_adefered >= HFI1_PSN_CREDIT) {
+ rc_cancel_ack(qp);
+ goto send_ack;
+ }
+ if (unlikely(is_fecn)) {
+ rc_cancel_ack(qp);
+ goto send_ack;
+ }
+ priv->r_adefered++;
+ rc_defered_ack(rcd, qp);
+ }
+ return;
+
+rnr_nak:
+ qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue RNR NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+
+nack_op_err:
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+
+nack_inv_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+
+nack_acc_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+ hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(rcd, qp, is_fecn);
+}
+
+void hfi1_rc_hdrerr(
+ struct hfi1_ctxtdata *rcd,
+ struct hfi1_ib_header *hdr,
+ u32 rcv_flags,
+ struct rvt_qp *qp)
+{
+ int has_grh = rcv_flags & HFI1_HAS_GRH;
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ int diff;
+ u32 opcode;
+ u32 psn, bth0;
+
+ /* Check for GRH */
+ ohdr = &hdr->u.oth;
+ if (has_grh)
+ ohdr = &hdr->u.l.oth;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+ return;
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode = (bth0 >> 24) & 0xff;
+
+ /* Only deal with RDMA Writes for now */
+ if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+ diff = delta_psn(psn, qp->r_psn);
+ if (!qp->r_nak_state && diff >= 0) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Wait to send the sequence
+ * NAK until all packets
+ * in the receive queue have
+ * been processed.
+ * Otherwise, we end up
+ * propagating congestion.
+ */
+ rc_defered_ack(rcd, qp);
+ } /* Out of sequence NAK */
+ } /* QP Request NAKs */
+}
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
new file mode 100644
index 000000000000..48d5094f98e2
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -0,0 +1,1002 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+#include "trace.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_hfi1_rnr_table[32] = {
+ 655360, /* 00: 655.36 */
+ 10, /* 01: .01 */
+ 20, /* 02 .02 */
+ 30, /* 03: .03 */
+ 40, /* 04: .04 */
+ 60, /* 05: .06 */
+ 80, /* 06: .08 */
+ 120, /* 07: .12 */
+ 160, /* 08: .16 */
+ 240, /* 09: .24 */
+ 320, /* 0A: .32 */
+ 480, /* 0B: .48 */
+ 640, /* 0C: .64 */
+ 960, /* 0D: .96 */
+ 1280, /* 0E: 1.28 */
+ 1920, /* 0F: 1.92 */
+ 2560, /* 10: 2.56 */
+ 3840, /* 11: 3.84 */
+ 5120, /* 12: 5.12 */
+ 7680, /* 13: 7.68 */
+ 10240, /* 14: 10.24 */
+ 15360, /* 15: 15.36 */
+ 20480, /* 16: 20.48 */
+ 30720, /* 17: 30.72 */
+ 40960, /* 18: 40.96 */
+ 61440, /* 19: 61.44 */
+ 81920, /* 1A: 81.92 */
+ 122880, /* 1B: 122.88 */
+ 163840, /* 1C: 163.84 */
+ 245760, /* 1D: 245.76 */
+ 327680, /* 1E: 327.68 */
+ 491520 /* 1F: 491.52 */
+};
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
+{
+ int i, j, ret;
+ struct ib_wc wc;
+ struct rvt_lkey_table *rkt;
+ struct rvt_pd *pd;
+ struct rvt_sge_state *ss;
+
+ rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
+ pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+ ss = &qp->r_sge;
+ ss->sg_list = qp->r_sg_list;
+ qp->r_len = 0;
+ for (i = j = 0; i < wqe->num_sge; i++) {
+ if (wqe->sg_list[i].length == 0)
+ continue;
+ /* Check LKEY */
+ if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+ &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ goto bad_lkey;
+ qp->r_len += wqe->sg_list[i].length;
+ j++;
+ }
+ ss->num_sge = j;
+ ss->total_len = qp->r_len;
+ ret = 1;
+ goto bail;
+
+bad_lkey:
+ while (j) {
+ struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+ rvt_put_mr(sge->mr);
+ }
+ ss->num_sge = 0;
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr_id;
+ wc.status = IB_WC_LOC_PROT_ERR;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ /* Signal solicited completion event. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only)
+{
+ unsigned long flags;
+ struct rvt_rq *rq;
+ struct rvt_rwq *wq;
+ struct rvt_srq *srq;
+ struct rvt_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ u32 tail;
+ int ret;
+
+ if (qp->ibqp.srq) {
+ srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+ if (unlikely(tail == wq->head)) {
+ ret = 0;
+ goto unlock;
+ }
+ /* Make sure entry is read after head index is read. */
+ smp_rmb();
+ wqe = rvt_get_rwqe_ptr(rq, tail);
+ /*
+ * Even though we update the tail index in memory, the verbs
+ * consumer is not supposed to post more entries until a
+ * completion is generated.
+ */
+ if (++tail >= rq->size)
+ tail = 0;
+ wq->tail = tail;
+ if (!wr_id_only && !init_sge(qp, wqe)) {
+ ret = -1;
+ goto unlock;
+ }
+ qp->r_wr_id = wqe->wr_id;
+
+ ret = 1;
+ set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
+ if (handler) {
+ u32 n;
+
+ /*
+ * Validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
+ }
+unlock:
+ spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+ return ret;
+}
+
+static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
+{
+ if (!index) {
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ return cpu_to_be64(ppd->guid);
+ }
+ return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+ return (gid->global.interface_id == id &&
+ (gid->global.subnet_prefix == gid_prefix ||
+ gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ *
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the hfi1_migrate_qp() call.
+ */
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+ int has_grh, struct rvt_qp *qp, u32 bth0)
+{
+ __be64 guid;
+ unsigned long flags;
+ u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+ if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+ if (!has_grh) {
+ if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+ goto err;
+ } else {
+ if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+ goto err;
+ guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+ if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+ guid))
+ goto err;
+ if (!gid_ok(
+ &hdr->u.l.grh.sgid,
+ qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+ qp->alt_ah_attr.grh.dgid.global.interface_id))
+ goto err;
+ }
+ if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+ sc5, be16_to_cpu(hdr->lrh[3])))) {
+ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+ (u16)bth0,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ 0, qp->ibqp.qp_num,
+ be16_to_cpu(hdr->lrh[3]),
+ be16_to_cpu(hdr->lrh[1]));
+ goto err;
+ }
+ /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+ if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+ ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+ goto err;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_migrate_qp(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else {
+ if (!has_grh) {
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ goto err;
+ } else {
+ if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+ goto err;
+ guid = get_sguid(ibp,
+ qp->remote_ah_attr.grh.sgid_index);
+ if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
+ guid))
+ goto err;
+ if (!gid_ok(
+ &hdr->u.l.grh.sgid,
+ qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+ qp->remote_ah_attr.grh.dgid.global.interface_id))
+ goto err;
+ }
+ if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+ sc5, be16_to_cpu(hdr->lrh[3])))) {
+ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+ (u16)bth0,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ 0, qp->ibqp.qp_num,
+ be16_to_cpu(hdr->lrh[3]),
+ be16_to_cpu(hdr->lrh[1]));
+ goto err;
+ }
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+ ppd_from_ibp(ibp)->port != qp->port_num)
+ goto err;
+ if (qp->s_mig_state == IB_MIG_REARM &&
+ !(bth0 & IB_BTH_MIG_REQ))
+ qp->s_mig_state = IB_MIG_ARMED;
+ }
+
+ return 0;
+
+err:
+ return 1;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from hfi1_do_send() to
+ * forward a WQE addressed to the same HFI.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ruc_loopback(struct rvt_qp *sqp)
+{
+ struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+ struct rvt_qp *qp;
+ struct rvt_swqe *wqe;
+ struct rvt_sge *sge;
+ unsigned long flags;
+ struct ib_wc wc;
+ u64 sdata;
+ atomic64_t *maddr;
+ enum ib_wc_status send_status;
+ int release;
+ int ret;
+ int copy_last = 0;
+ u32 to;
+ int local_ops = 0;
+
+ rcu_read_lock();
+
+ /*
+ * Note that we check the responder QP state after
+ * checking the requester's state.
+ */
+ qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
+ sqp->remote_qpn);
+
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
+ !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+ goto unlock;
+
+ sqp->s_flags |= RVT_S_BUSY;
+
+again:
+ smp_read_barrier_depends(); /* see post_one_send() */
+ if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
+ goto clr_busy;
+ wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
+
+ /* Return if it is not OK to start a new work request. */
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
+ goto clr_busy;
+ /* We are in the error state, flush the work request. */
+ send_status = IB_WC_WR_FLUSH_ERR;
+ goto flush_send;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ * We increment s_cur to indicate s_last is in progress.
+ */
+ if (sqp->s_last == sqp->s_cur) {
+ if (++sqp->s_cur >= sqp->s_size)
+ sqp->s_cur = 0;
+ }
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+ qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+ ibp->rvp.n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ if (sqp->ibqp.qp_type == IB_QPT_RC)
+ send_status = IB_WC_RETRY_EXC_ERR;
+ else
+ send_status = IB_WC_SUCCESS;
+ goto serr;
+ }
+
+ memset(&wc, 0, sizeof(wc));
+ send_status = IB_WC_SUCCESS;
+
+ release = 1;
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_REG_MR:
+ goto send_comp;
+
+ case IB_WR_LOCAL_INV:
+ if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+ if (rvt_invalidate_rkey(sqp,
+ wqe->wr.ex.invalidate_rkey))
+ send_status = IB_WC_LOC_PROT_ERR;
+ local_ops = 1;
+ }
+ goto send_comp;
+
+ case IB_WR_SEND_WITH_INV:
+ if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
+ wc.wc_flags = IB_WC_WITH_INVALIDATE;
+ wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
+ }
+ goto send;
+
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ /* FALLTHROUGH */
+ case IB_WR_SEND:
+send:
+ ret = hfi1_rvt_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ ret = hfi1_rvt_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ /* skip copy_last set and qp_access_flags recheck */
+ goto do_write;
+ case IB_WR_RDMA_WRITE:
+ copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+do_write:
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+ wqe->rdma_wr.remote_addr,
+ wqe->rdma_wr.rkey,
+ IB_ACCESS_REMOTE_WRITE)))
+ goto acc_err;
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.num_sge = 1;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto inv_err;
+ if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+ wqe->rdma_wr.remote_addr,
+ wqe->rdma_wr.rkey,
+ IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ release = 0;
+ sqp->s_sge.sg_list = NULL;
+ sqp->s_sge.num_sge = 1;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto inv_err;
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ wqe->atomic_wr.remote_addr,
+ wqe->atomic_wr.rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto acc_err;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+ sdata = wqe->atomic_wr.compare_add;
+ *(u64 *)sqp->s_sge.sge.vaddr =
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+ (u64)atomic64_add_return(sdata, maddr) - sdata :
+ (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+ sdata, wqe->atomic_wr.swap);
+ rvt_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ goto send_comp;
+
+ default:
+ send_status = IB_WC_LOC_QP_OP_ERR;
+ goto serr;
+ }
+
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = sqp->s_len;
+
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (!release)
+ rvt_put_mr(sge->mr);
+ if (--sqp->s_sge.num_sge)
+ *sge = *sqp->s_sge.sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ sqp->s_len -= len;
+ }
+ if (release)
+ rvt_put_ss(&qp->r_sge);
+
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+ wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ ibp->rvp.n_loop_pkts++;
+flush_send:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+ hfi1_send_complete(sqp, wqe, send_status);
+ if (local_ops) {
+ atomic_dec(&sqp->local_ops_pending);
+ local_ops = 0;
+ }
+ goto again;
+
+rnr_nak:
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ ibp->rvp.n_rnr_naks++;
+ /*
+ * Note: we don't need the s_lock held since the BUSY flag
+ * makes this single threaded.
+ */
+ if (sqp->s_rnr_retry == 0) {
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto serr;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
+ goto clr_busy;
+ to = ib_hfi1_rnr_table[qp->r_min_rnr_timer];
+ hfi1_add_rnr_timer(sqp, to);
+ goto clr_busy;
+
+op_err:
+ send_status = IB_WC_REM_OP_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+inv_err:
+ send_status = IB_WC_REM_INV_REQ_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+acc_err:
+ send_status = IB_WC_REM_ACCESS_ERR;
+ wc.status = IB_WC_LOC_PROT_ERR;
+err:
+ /* responder goes to error state */
+ hfi1_rc_error(qp, wc.status);
+
+serr:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ hfi1_send_complete(sqp, wqe, send_status);
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
+ int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+ sqp->s_flags &= ~RVT_S_BUSY;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = sqp->ibqp.device;
+ ev.element.qp = &sqp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+ }
+ goto done;
+ }
+clr_busy:
+ sqp->s_flags &= ~RVT_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+ rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+ hdr->version_tclass_flow =
+ cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+ (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+ (grh->flow_label << IB_GRH_FLOW_SHIFT));
+ hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+ /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+ hdr->next_hdr = IB_GRH_NEXT_HDR;
+ hdr->hop_limit = grh->hop_limit;
+ /* The SGID is 32-bit aligned. */
+ hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
+ hdr->sgid.global.interface_id =
+ grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
+ ibp->guids[grh->sgid_index - 1] :
+ cpu_to_be64(ppd_from_ibp(ibp)->guid);
+ hdr->dgid = grh->dgid;
+
+ /* GRH header size in 32-bit words. */
+ return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4)
+
+/**
+ * build_ahg - create ahg in s_ahg
+ * @qp: a pointer to QP
+ * @npsn: the next PSN for the request/response
+ *
+ * This routine handles the AHG by allocating an ahg entry and causing the
+ * copy of the first middle.
+ *
+ * Subsequent middles use the copied entry, editing the
+ * PSN with 1 or 2 edits.
+ */
+static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR))
+ clear_ahg(qp);
+ if (!(qp->s_flags & RVT_S_AHG_VALID)) {
+ /* first middle that needs copy */
+ if (qp->s_ahgidx < 0)
+ qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
+ if (qp->s_ahgidx >= 0) {
+ qp->s_ahgpsn = npsn;
+ priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+ /* save to protect a change in another thread */
+ priv->s_ahg->ahgidx = qp->s_ahgidx;
+ qp->s_flags |= RVT_S_AHG_VALID;
+ }
+ } else {
+ /* subsequent middle after valid */
+ if (qp->s_ahgidx >= 0) {
+ priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+ priv->s_ahg->ahgidx = qp->s_ahgidx;
+ priv->s_ahg->ahgcount++;
+ priv->s_ahg->ahgdesc[0] =
+ sdma_build_ahg_descriptor(
+ (__force u16)cpu_to_be16((u16)npsn),
+ BTH2_OFFSET,
+ 16,
+ 16);
+ if ((npsn & 0xffff0000) !=
+ (qp->s_ahgpsn & 0xffff0000)) {
+ priv->s_ahg->ahgcount++;
+ priv->s_ahg->ahgdesc[1] =
+ sdma_build_ahg_descriptor(
+ (__force u16)cpu_to_be16(
+ (u16)(npsn >> 16)),
+ BTH2_OFFSET,
+ 0,
+ 16);
+ }
+ }
+ }
+}
+
+void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
+ u32 bth0, u32 bth2, int middle,
+ struct hfi1_pkt_state *ps)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp = ps->ibp;
+ u16 lrh0;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth1;
+
+ /* Construct the header. */
+ extra_bytes = -qp->s_cur_size & 3;
+ nwords = (qp->s_cur_size + extra_bytes) >> 2;
+ lrh0 = HFI1_LRH_BTH;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ qp->s_hdrwords += hfi1_make_grh(ibp,
+ &ps->s_txreq->phdr.hdr.u.l.grh,
+ &qp->remote_ah_attr.grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = HFI1_LRH_GRH;
+ middle = 0;
+ }
+ lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+ /*
+ * reset s_ahg/AHG fields
+ *
+ * This insures that the ahgentry/ahgcount
+ * are at a non-AHG default to protect
+ * build_verbs_tx_desc() from using
+ * an include ahgidx.
+ *
+ * build_ahg() will modify as appropriate
+ * to use the AHG feature.
+ */
+ priv->s_ahg->tx_flags = 0;
+ priv->s_ahg->ahgcount = 0;
+ priv->s_ahg->ahgidx = 0;
+ if (qp->s_mig_state == IB_MIG_MIGRATED)
+ bth0 |= IB_BTH_MIG_REQ;
+ else
+ middle = 0;
+ if (middle)
+ build_ahg(qp, bth2);
+ else
+ qp->s_flags &= ~RVT_S_AHG_VALID;
+ ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
+ ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ ps->s_txreq->phdr.hdr.lrh[2] =
+ cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+ qp->remote_ah_attr.src_path_bits);
+ bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+ bth0 |= extra_bytes << 20;
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ bth1 = qp->remote_qpn;
+ if (qp->s_flags & RVT_S_ECN) {
+ qp->s_flags &= ~RVT_S_ECN;
+ /* we recently received a FECN, so return a BECN */
+ bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+ }
+ ohdr->bth[1] = cpu_to_be32(bth1);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/* when sending, force a reschedule every one of these periods */
+#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
+
+void _hfi1_do_send(struct work_struct *work)
+{
+ struct iowait *wait = container_of(work, struct iowait, iowork);
+ struct rvt_qp *qp = iowait_to_qp(wait);
+
+ hfi1_do_send(qp);
+}
+
+/**
+ * hfi1_do_send - perform a send on a QP
+ * @work: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted. Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void hfi1_do_send(struct rvt_qp *qp)
+{
+ struct hfi1_pkt_state ps;
+ struct hfi1_qp_priv *priv = qp->priv;
+ int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+ unsigned long timeout;
+ unsigned long timeout_int;
+ int cpu;
+
+ ps.dev = to_idev(qp->ibqp.device);
+ ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ps.ppd = ppd_from_ibp(ps.ibp);
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
+ ) - 1)) ==
+ ps.ppd->lid)) {
+ ruc_loopback(qp);
+ return;
+ }
+ make_req = hfi1_make_rc_req;
+ timeout_int = (qp->timeout_jiffies);
+ break;
+ case IB_QPT_UC:
+ if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
+ ) - 1)) ==
+ ps.ppd->lid)) {
+ ruc_loopback(qp);
+ return;
+ }
+ make_req = hfi1_make_uc_req;
+ timeout_int = SEND_RESCHED_TIMEOUT;
+ break;
+ default:
+ make_req = hfi1_make_ud_req;
+ timeout_int = SEND_RESCHED_TIMEOUT;
+ }
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_ok(qp)) {
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ return;
+ }
+
+ qp->s_flags |= RVT_S_BUSY;
+
+ timeout = jiffies + (timeout_int) / 8;
+ cpu = priv->s_sde ? priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+ /* insure a pre-built packet is handled */
+ ps.s_txreq = get_waiting_verbs_txreq(qp);
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ /*
+ * If the packet cannot be sent now, return and
+ * the send tasklet will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, &ps))
+ return;
+ /* Record that s_ahg is empty. */
+ qp->s_hdrwords = 0;
+ /* allow other tasks to run */
+ if (unlikely(time_after(jiffies, timeout))) {
+ if (workqueue_congested(cpu,
+ ps.ppd->hfi1_wq)) {
+ spin_lock_irqsave(
+ &qp->s_lock,
+ ps.flags);
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ spin_unlock_irqrestore(
+ &qp->s_lock,
+ ps.flags);
+ this_cpu_inc(
+ *ps.ppd->dd->send_schedule);
+ return;
+ }
+ if (!irqs_disabled()) {
+ cond_resched();
+ this_cpu_inc(
+ *ps.ppd->dd->send_schedule);
+ }
+ timeout = jiffies + (timeout_int) / 8;
+ }
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+ }
+ } while (make_req(qp, &ps));
+
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ enum ib_wc_status status)
+{
+ u32 old_last, last;
+ unsigned i;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ last = qp->s_last;
+ old_last = last;
+ if (++last >= qp->s_size)
+ last = 0;
+ qp->s_last = last;
+ /* See post_send() */
+ barrier();
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct rvt_sge *sge = &wqe->sg_list[i];
+
+ rvt_put_mr(sge->mr);
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UD ||
+ qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)
+ atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+
+ /* See ch. 11.2.4.1 and 10.7.3.1 */
+ if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ status != IB_WC_SUCCESS) {
+ struct ib_wc wc;
+
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = status;
+ wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+ wc.qp = &qp->ibqp;
+ if (status == IB_WC_SUCCESS)
+ wc.byte_len = wqe->length;
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc,
+ status != IB_WC_SUCCESS);
+ }
+
+ if (qp->s_acked == old_last)
+ qp->s_acked = last;
+ if (qp->s_cur == old_last)
+ qp->s_cur = last;
+ if (qp->s_tail == old_last)
+ qp->s_tail = last;
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+ qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
new file mode 100644
index 000000000000..f9befc05b349
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -0,0 +1,3054 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "qp.h"
+#include "sdma.h"
+#include "iowait.h"
+#include "trace.h"
+
+/* must be a power of 2 >= 64 <= 32768 */
+#define SDMA_DESCQ_CNT 2048
+#define SDMA_DESC_INTR 64
+#define INVALID_TAIL 0xffff
+
+static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
+module_param(sdma_descq_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+static uint sdma_idle_cnt = 250;
+module_param(sdma_idle_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
+
+uint mod_num_sdma;
+module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
+MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
+
+static uint sdma_desct_intr = SDMA_DESC_INTR;
+module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt");
+
+#define SDMA_WAIT_BATCH_SIZE 20
+/* max wait time for a SDMA engine to indicate it has halted */
+#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
+/* all SDMA engine errors that cause a halt */
+
+#define SD(name) SEND_DMA_##name
+#define ALL_SDMA_ENG_HALT_ERRS \
+ (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
+ | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
+
+/* sdma_sendctrl operations */
+#define SDMA_SENDCTRL_OP_ENABLE BIT(0)
+#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
+#define SDMA_SENDCTRL_OP_HALT BIT(2)
+#define SDMA_SENDCTRL_OP_CLEANUP BIT(3)
+
+/* handle long defines */
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
+
+static const char * const sdma_state_names[] = {
+ [sdma_state_s00_hw_down] = "s00_HwDown",
+ [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait",
+ [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
+ [sdma_state_s20_idle] = "s20_Idle",
+ [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait",
+ [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait",
+ [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait",
+ [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait",
+ [sdma_state_s80_hw_freeze] = "s80_HwFreeze",
+ [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean",
+ [sdma_state_s99_running] = "s99_Running",
+};
+
+#ifdef CONFIG_SDMA_VERBOSITY
+static const char * const sdma_event_names[] = {
+ [sdma_event_e00_go_hw_down] = "e00_GoHwDown",
+ [sdma_event_e10_go_hw_start] = "e10_GoHwStart",
+ [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
+ [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
+ [sdma_event_e30_go_running] = "e30_GoRunning",
+ [sdma_event_e40_sw_cleaned] = "e40_SwCleaned",
+ [sdma_event_e50_hw_cleaned] = "e50_HwCleaned",
+ [sdma_event_e60_hw_halted] = "e60_HwHalted",
+ [sdma_event_e70_go_idle] = "e70_GoIdle",
+ [sdma_event_e80_hw_freeze] = "e80_HwFreeze",
+ [sdma_event_e81_hw_frozen] = "e81_HwFrozen",
+ [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze",
+ [sdma_event_e85_link_down] = "e85_LinkDown",
+ [sdma_event_e90_sw_halted] = "e90_SwHalted",
+};
+#endif
+
+static const struct sdma_set_state_action sdma_action_table[] = {
+ [sdma_state_s00_hw_down] = {
+ .go_s99_running_tofalse = 1,
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s10_hw_start_up_halt_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 1,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s15_hw_start_up_clean_wait] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_cleanup = 1,
+ },
+ [sdma_state_s20_idle] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s30_sw_clean_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s40_hw_clean_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 1,
+ },
+ [sdma_state_s50_hw_halt_wait] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s60_idle_halt_wait] = {
+ .go_s99_running_tofalse = 1,
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 1,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s80_hw_freeze] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s82_freeze_sw_clean] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ },
+ [sdma_state_s99_running] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_cleanup = 0,
+ .go_s99_running_totrue = 1,
+ },
+};
+
+#define SDMA_TAIL_UPDATE_THRESH 0x1F
+
+/* declare all statics here rather than keep sorting */
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct sdma_state *);
+static void sdma_get(struct sdma_state *);
+static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_put(struct sdma_state *);
+static void sdma_set_state(struct sdma_engine *, enum sdma_states);
+static void sdma_start_hw_clean_up(struct sdma_engine *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sendctrl(struct sdma_engine *, unsigned);
+static void init_sdma_regs(struct sdma_engine *, u32, uint);
+static void sdma_process_event(
+ struct sdma_engine *sde,
+ enum sdma_events event);
+static void __sdma_process_event(
+ struct sdma_engine *sde,
+ enum sdma_events event);
+static void dump_sdma_state(struct sdma_engine *sde);
+static void sdma_make_progress(struct sdma_engine *sde, u64 status);
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_flush_descq(struct sdma_engine *sde);
+
+/**
+ * sdma_state_name() - return state string from enum
+ * @state: state
+ */
+static const char *sdma_state_name(enum sdma_states state)
+{
+ return sdma_state_names[state];
+}
+
+static void sdma_get(struct sdma_state *ss)
+{
+ kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+ struct sdma_state *ss =
+ container_of(kref, struct sdma_state, kref);
+
+ complete(&ss->comp);
+}
+
+static void sdma_put(struct sdma_state *ss)
+{
+ kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct sdma_state *ss)
+{
+ sdma_put(ss);
+ wait_for_completion(&ss->comp);
+}
+
+static inline void write_sde_csr(
+ struct sdma_engine *sde,
+ u32 offset0,
+ u64 value)
+{
+ write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
+}
+
+static inline u64 read_sde_csr(
+ struct sdma_engine *sde,
+ u32 offset0)
+{
+ return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
+}
+
+/*
+ * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
+ * sdma engine 'sde' to drop to 0.
+ */
+static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
+ int pause)
+{
+ u64 off = 8 * sde->this_idx;
+ struct hfi1_devdata *dd = sde->dd;
+ int lcnt = 0;
+ u64 reg_prev;
+ u64 reg = 0;
+
+ while (1) {
+ reg_prev = reg;
+ reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
+
+ reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
+ reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
+ if (reg == 0)
+ break;
+ /* counter is reest if accupancy count changes */
+ if (reg != reg_prev)
+ lcnt = 0;
+ if (lcnt++ > 500) {
+ /* timed out - bounce the link */
+ dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
+ __func__, sde->this_idx, (u32)reg);
+ queue_work(dd->pport->hfi1_wq,
+ &dd->pport->link_bounce_work);
+ break;
+ }
+ udelay(1);
+ }
+}
+
+/*
+ * sdma_wait() - wait for packet egress to complete for all SDMA engines,
+ * and pause for credit return.
+ */
+void sdma_wait(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->num_sdma; i++) {
+ struct sdma_engine *sde = &dd->per_sdma[i];
+
+ sdma_wait_for_packet_egress(sde, 0);
+ }
+}
+
+static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
+{
+ u64 reg;
+
+ if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
+ return;
+ reg = cnt;
+ reg &= SD(DESC_CNT_CNT_MASK);
+ reg <<= SD(DESC_CNT_CNT_SHIFT);
+ write_sde_csr(sde, SD(DESC_CNT), reg);
+}
+
+static inline void complete_tx(struct sdma_engine *sde,
+ struct sdma_txreq *tx,
+ int res)
+{
+ /* protect against complete modifying */
+ struct iowait *wait = tx->wait;
+ callback_t complete = tx->complete;
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ trace_hfi1_sdma_out_sn(sde, tx->sn);
+ if (WARN_ON_ONCE(sde->head_sn != tx->sn))
+ dd_dev_err(sde->dd, "expected %llu got %llu\n",
+ sde->head_sn, tx->sn);
+ sde->head_sn++;
+#endif
+ sdma_txclean(sde->dd, tx);
+ if (complete)
+ (*complete)(tx, res);
+ if (wait && iowait_sdma_dec(wait))
+ iowait_drain_wakeup(wait);
+}
+
+/*
+ * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
+ *
+ * Depending on timing there can be txreqs in two places:
+ * - in the descq ring
+ * - in the flush list
+ *
+ * To avoid ordering issues the descq ring needs to be flushed
+ * first followed by the flush list.
+ *
+ * This routine is called from two places
+ * - From a work queue item
+ * - Directly from the state machine just before setting the
+ * state to running
+ *
+ * Must be called with head_lock held
+ *
+ */
+static void sdma_flush(struct sdma_engine *sde)
+{
+ struct sdma_txreq *txp, *txp_next;
+ LIST_HEAD(flushlist);
+ unsigned long flags;
+
+ /* flush from head to tail */
+ sdma_flush_descq(sde);
+ spin_lock_irqsave(&sde->flushlist_lock, flags);
+ /* copy flush list */
+ list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
+ list_del_init(&txp->list);
+ list_add_tail(&txp->list, &flushlist);
+ }
+ spin_unlock_irqrestore(&sde->flushlist_lock, flags);
+ /* flush from flush list */
+ list_for_each_entry_safe(txp, txp_next, &flushlist, list)
+ complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+}
+
+/*
+ * Fields a work request for flushing the descq ring
+ * and the flush list
+ *
+ * If the engine has been brought to running during
+ * the scheduling delay, the flush is ignored, assuming
+ * that the process of bringing the engine to running
+ * would have done this flush prior to going to running.
+ *
+ */
+static void sdma_field_flush(struct work_struct *work)
+{
+ unsigned long flags;
+ struct sdma_engine *sde =
+ container_of(work, struct sdma_engine, flush_worker);
+
+ write_seqlock_irqsave(&sde->head_lock, flags);
+ if (!__sdma_running(sde))
+ sdma_flush(sde);
+ write_sequnlock_irqrestore(&sde->head_lock, flags);
+}
+
+static void sdma_err_halt_wait(struct work_struct *work)
+{
+ struct sdma_engine *sde = container_of(work, struct sdma_engine,
+ err_halt_worker);
+ u64 statuscsr;
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
+ while (1) {
+ statuscsr = read_sde_csr(sde, SD(STATUS));
+ statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
+ if (statuscsr)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(sde->dd,
+ "SDMA engine %d - timeout waiting for engine to halt\n",
+ sde->this_idx);
+ /*
+ * Continue anyway. This could happen if there was
+ * an uncorrectable error in the wrong spot.
+ */
+ break;
+ }
+ usleep_range(80, 120);
+ }
+
+ sdma_process_event(sde, sdma_event_e15_hw_halt_done);
+}
+
+static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
+{
+ if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
+ unsigned index;
+ struct hfi1_devdata *dd = sde->dd;
+
+ for (index = 0; index < dd->num_sdma; index++) {
+ struct sdma_engine *curr_sdma = &dd->per_sdma[index];
+
+ if (curr_sdma != sde)
+ curr_sdma->progress_check_head =
+ curr_sdma->descq_head;
+ }
+ dd_dev_err(sde->dd,
+ "SDMA engine %d - check scheduled\n",
+ sde->this_idx);
+ mod_timer(&sde->err_progress_check_timer, jiffies + 10);
+ }
+}
+
+static void sdma_err_progress_check(unsigned long data)
+{
+ unsigned index;
+ struct sdma_engine *sde = (struct sdma_engine *)data;
+
+ dd_dev_err(sde->dd, "SDE progress check event\n");
+ for (index = 0; index < sde->dd->num_sdma; index++) {
+ struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
+ unsigned long flags;
+
+ /* check progress on each engine except the current one */
+ if (curr_sde == sde)
+ continue;
+ /*
+ * We must lock interrupts when acquiring sde->lock,
+ * to avoid a deadlock if interrupt triggers and spins on
+ * the same lock on same CPU
+ */
+ spin_lock_irqsave(&curr_sde->tail_lock, flags);
+ write_seqlock(&curr_sde->head_lock);
+
+ /* skip non-running queues */
+ if (curr_sde->state.current_state != sdma_state_s99_running) {
+ write_sequnlock(&curr_sde->head_lock);
+ spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+ continue;
+ }
+
+ if ((curr_sde->descq_head != curr_sde->descq_tail) &&
+ (curr_sde->descq_head ==
+ curr_sde->progress_check_head))
+ __sdma_process_event(curr_sde,
+ sdma_event_e90_sw_halted);
+ write_sequnlock(&curr_sde->head_lock);
+ spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+ }
+ schedule_work(&sde->err_halt_worker);
+}
+
+static void sdma_hw_clean_up_task(unsigned long opaque)
+{
+ struct sdma_engine *sde = (struct sdma_engine *)opaque;
+ u64 statuscsr;
+
+ while (1) {
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__,
+ __func__);
+#endif
+ statuscsr = read_sde_csr(sde, SD(STATUS));
+ statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
+ if (statuscsr)
+ break;
+ udelay(10);
+ }
+
+ sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
+}
+
+static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
+{
+ smp_read_barrier_depends(); /* see sdma_update_tail() */
+ return sde->tx_ring[sde->tx_head & sde->sdma_mask];
+}
+
+/*
+ * flush ring for recovery
+ */
+static void sdma_flush_descq(struct sdma_engine *sde)
+{
+ u16 head, tail;
+ int progress = 0;
+ struct sdma_txreq *txp = get_txhead(sde);
+
+ /* The reason for some of the complexity of this code is that
+ * not all descriptors have corresponding txps. So, we have to
+ * be able to skip over descs until we wander into the range of
+ * the next txp on the list.
+ */
+ head = sde->descq_head & sde->sdma_mask;
+ tail = sde->descq_tail & sde->sdma_mask;
+ while (head != tail) {
+ /* advance head, wrap if needed */
+ head = ++sde->descq_head & sde->sdma_mask;
+ /* if now past this txp's descs, do the callback */
+ if (txp && txp->next_descq_idx == head) {
+ /* remove from list */
+ sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+ complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+ trace_hfi1_sdma_progress(sde, head, tail, txp);
+ txp = get_txhead(sde);
+ }
+ progress++;
+ }
+ if (progress)
+ sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+ struct sdma_engine *sde = (struct sdma_engine *)opaque;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sde->tail_lock, flags);
+ write_seqlock(&sde->head_lock);
+
+ /*
+ * At this point, the following should always be true:
+ * - We are halted, so no more descriptors are getting retired.
+ * - We are not running, so no one is submitting new work.
+ * - Only we can send the e40_sw_cleaned, so we can't start
+ * running again until we say so. So, the active list and
+ * descq are ours to play with.
+ */
+
+ /*
+ * In the error clean up sequence, software clean must be called
+ * before the hardware clean so we can use the hardware head in
+ * the progress routine. A hardware clean or SPC unfreeze will
+ * reset the hardware head.
+ *
+ * Process all retired requests. The progress routine will use the
+ * latest physical hardware head - we are not running so speed does
+ * not matter.
+ */
+ sdma_make_progress(sde, 0);
+
+ sdma_flush(sde);
+
+ /*
+ * Reset our notion of head and tail.
+ * Note that the HW registers have been reset via an earlier
+ * clean up.
+ */
+ sde->descq_tail = 0;
+ sde->descq_head = 0;
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ *sde->head_dma = 0;
+
+ __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
+
+ write_sequnlock(&sde->head_lock);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sw_tear_down(struct sdma_engine *sde)
+{
+ struct sdma_state *ss = &sde->state;
+
+ /* Releasing this reference means the state machine has stopped. */
+ sdma_put(ss);
+
+ /* stop waiting for all unfreeze events to complete */
+ atomic_set(&sde->dd->sdma_unfreeze_count, -1);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+}
+
+static void sdma_start_hw_clean_up(struct sdma_engine *sde)
+{
+ tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
+}
+
+static void sdma_set_state(struct sdma_engine *sde,
+ enum sdma_states next_state)
+{
+ struct sdma_state *ss = &sde->state;
+ const struct sdma_set_state_action *action = sdma_action_table;
+ unsigned op = 0;
+
+ trace_hfi1_sdma_state(
+ sde,
+ sdma_state_names[ss->current_state],
+ sdma_state_names[next_state]);
+
+ /* debugging bookkeeping */
+ ss->previous_state = ss->current_state;
+ ss->previous_op = ss->current_op;
+ ss->current_state = next_state;
+
+ if (ss->previous_state != sdma_state_s99_running &&
+ next_state == sdma_state_s99_running)
+ sdma_flush(sde);
+
+ if (action[next_state].op_enable)
+ op |= SDMA_SENDCTRL_OP_ENABLE;
+
+ if (action[next_state].op_intenable)
+ op |= SDMA_SENDCTRL_OP_INTENABLE;
+
+ if (action[next_state].op_halt)
+ op |= SDMA_SENDCTRL_OP_HALT;
+
+ if (action[next_state].op_cleanup)
+ op |= SDMA_SENDCTRL_OP_CLEANUP;
+
+ if (action[next_state].go_s99_running_tofalse)
+ ss->go_s99_running = 0;
+
+ if (action[next_state].go_s99_running_totrue)
+ ss->go_s99_running = 1;
+
+ ss->current_op = op;
+ sdma_sendctrl(sde, ss->current_op);
+}
+
+/**
+ * sdma_get_descq_cnt() - called when device probed
+ *
+ * Return a validated descq count.
+ *
+ * This is currently only used in the verbs initialization to build the tx
+ * list.
+ *
+ * This will probably be deleted in favor of a more scalable approach to
+ * alloc tx's.
+ *
+ */
+u16 sdma_get_descq_cnt(void)
+{
+ u16 count = sdma_descq_cnt;
+
+ if (!count)
+ return SDMA_DESCQ_CNT;
+ /* count must be a power of 2 greater than 64 and less than
+ * 32768. Otherwise return default.
+ */
+ if (!is_power_of_2(count))
+ return SDMA_DESCQ_CNT;
+ if (count < 64 || count > 32768)
+ return SDMA_DESCQ_CNT;
+ return count;
+}
+
+/**
+ * sdma_select_engine_vl() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ *
+ * This function returns an engine based on the selector and a vl. The
+ * mapping fields are protected by RCU.
+ */
+struct sdma_engine *sdma_select_engine_vl(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 vl)
+{
+ struct sdma_vl_map *m;
+ struct sdma_map_elem *e;
+ struct sdma_engine *rval;
+
+ /* NOTE This should only happen if SC->VL changed after the initial
+ * checks on the QP/AH
+ * Default will return engine 0 below
+ */
+ if (vl >= num_vls) {
+ rval = NULL;
+ goto done;
+ }
+
+ rcu_read_lock();
+ m = rcu_dereference(dd->sdma_map);
+ if (unlikely(!m)) {
+ rcu_read_unlock();
+ return &dd->per_sdma[0];
+ }
+ e = m->map[vl & m->mask];
+ rval = e->sde[selector & e->mask];
+ rcu_read_unlock();
+
+done:
+ rval = !rval ? &dd->per_sdma[0] : rval;
+ trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
+ return rval;
+}
+
+/**
+ * sdma_select_engine_sc() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ *
+ * This function returns an engine based on the selector and an sc.
+ */
+struct sdma_engine *sdma_select_engine_sc(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 sc5)
+{
+ u8 vl = sc_to_vlt(dd, sc5);
+
+ return sdma_select_engine_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void sdma_map_free(struct sdma_vl_map *m)
+{
+ int i;
+
+ for (i = 0; m && i < m->actual_vls; i++)
+ kfree(m->map[i]);
+ kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void sdma_map_rcu_callback(struct rcu_head *list)
+{
+ struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
+
+ sdma_map_free(m);
+}
+
+/**
+ * sdma_map_init - called when # vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_engines: per vl engine mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_engines is used to specify a non-uniform vl/engine loading. NULL
+ * implies auto computing the loading and giving each VLs a uniform
+ * distribution of engines per VL.
+ *
+ * The auto algorithm computes the sde_per_vl and the number of extra
+ * engines. Any extra engines are added from the last VL on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either the num_vls or num_sdma are non-power of 2, the array sizes
+ * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
+ * up to the next highest power of 2 and the first entry is reused
+ * in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is
+ * not changed.
+ *
+ */
+int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
+{
+ int i, j;
+ int extra, sde_per_vl;
+ int engine = 0;
+ u8 lvl_engines[OPA_MAX_VLS];
+ struct sdma_vl_map *oldmap, *newmap;
+
+ if (!(dd->flags & HFI1_HAS_SEND_DMA))
+ return 0;
+
+ if (!vl_engines) {
+ /* truncate divide */
+ sde_per_vl = dd->num_sdma / num_vls;
+ /* extras */
+ extra = dd->num_sdma % num_vls;
+ vl_engines = lvl_engines;
+ /* add extras from last vl down */
+ for (i = num_vls - 1; i >= 0; i--, extra--)
+ vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
+ }
+ /* build new map */
+ newmap = kzalloc(
+ sizeof(struct sdma_vl_map) +
+ roundup_pow_of_two(num_vls) *
+ sizeof(struct sdma_map_elem *),
+ GFP_KERNEL);
+ if (!newmap)
+ goto bail;
+ newmap->actual_vls = num_vls;
+ newmap->vls = roundup_pow_of_two(num_vls);
+ newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+ /* initialize back-map */
+ for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
+ newmap->engine_to_vl[i] = -1;
+ for (i = 0; i < newmap->vls; i++) {
+ /* save for wrap around */
+ int first_engine = engine;
+
+ if (i < newmap->actual_vls) {
+ int sz = roundup_pow_of_two(vl_engines[i]);
+
+ /* only allocate once */
+ newmap->map[i] = kzalloc(
+ sizeof(struct sdma_map_elem) +
+ sz * sizeof(struct sdma_engine *),
+ GFP_KERNEL);
+ if (!newmap->map[i])
+ goto bail;
+ newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+ /* assign engines */
+ for (j = 0; j < sz; j++) {
+ newmap->map[i]->sde[j] =
+ &dd->per_sdma[engine];
+ if (++engine >= first_engine + vl_engines[i])
+ /* wrap back to first engine */
+ engine = first_engine;
+ }
+ /* assign back-map */
+ for (j = 0; j < vl_engines[i]; j++)
+ newmap->engine_to_vl[first_engine + j] = i;
+ } else {
+ /* just re-use entry without allocating */
+ newmap->map[i] = newmap->map[i % num_vls];
+ }
+ engine = first_engine + vl_engines[i];
+ }
+ /* newmap in hand, save old map */
+ spin_lock_irq(&dd->sde_map_lock);
+ oldmap = rcu_dereference_protected(dd->sdma_map,
+ lockdep_is_held(&dd->sde_map_lock));
+
+ /* publish newmap */
+ rcu_assign_pointer(dd->sdma_map, newmap);
+
+ spin_unlock_irq(&dd->sde_map_lock);
+ /* success, free any old map after grace period */
+ if (oldmap)
+ call_rcu(&oldmap->list, sdma_map_rcu_callback);
+ return 0;
+bail:
+ /* free any partial allocation */
+ sdma_map_free(newmap);
+ return -ENOMEM;
+}
+
+/*
+ * Clean up allocated memory.
+ *
+ * This routine is can be called regardless of the success of sdma_init()
+ *
+ */
+static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
+{
+ size_t i;
+ struct sdma_engine *sde;
+
+ if (dd->sdma_pad_dma) {
+ dma_free_coherent(&dd->pcidev->dev, 4,
+ (void *)dd->sdma_pad_dma,
+ dd->sdma_pad_phys);
+ dd->sdma_pad_dma = NULL;
+ dd->sdma_pad_phys = 0;
+ }
+ if (dd->sdma_heads_dma) {
+ dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
+ (void *)dd->sdma_heads_dma,
+ dd->sdma_heads_phys);
+ dd->sdma_heads_dma = NULL;
+ dd->sdma_heads_phys = 0;
+ }
+ for (i = 0; dd->per_sdma && i < num_engines; ++i) {
+ sde = &dd->per_sdma[i];
+
+ sde->head_dma = NULL;
+ sde->head_phys = 0;
+
+ if (sde->descq) {
+ dma_free_coherent(
+ &dd->pcidev->dev,
+ sde->descq_cnt * sizeof(u64[2]),
+ sde->descq,
+ sde->descq_phys
+ );
+ sde->descq = NULL;
+ sde->descq_phys = 0;
+ }
+ kvfree(sde->tx_ring);
+ sde->tx_ring = NULL;
+ }
+ spin_lock_irq(&dd->sde_map_lock);
+ sdma_map_free(rcu_access_pointer(dd->sdma_map));
+ RCU_INIT_POINTER(dd->sdma_map, NULL);
+ spin_unlock_irq(&dd->sde_map_lock);
+ synchronize_rcu();
+ kfree(dd->per_sdma);
+ dd->per_sdma = NULL;
+}
+
+/**
+ * sdma_init() - called when device probed
+ * @dd: hfi1_devdata
+ * @port: port number (currently only zero)
+ *
+ * sdma_init initializes the specified number of engines.
+ *
+ * The code initializes each sde, its csrs. Interrupts
+ * are not required to be enabled.
+ *
+ * Returns:
+ * 0 - success, -errno on failure
+ */
+int sdma_init(struct hfi1_devdata *dd, u8 port)
+{
+ unsigned this_idx;
+ struct sdma_engine *sde;
+ u16 descq_cnt;
+ void *curr_head;
+ struct hfi1_pportdata *ppd = dd->pport + port;
+ u32 per_sdma_credits;
+ uint idle_cnt = sdma_idle_cnt;
+ size_t num_engines = dd->chip_sdma_engines;
+
+ if (!HFI1_CAP_IS_KSET(SDMA)) {
+ HFI1_CAP_CLEAR(SDMA_AHG);
+ return 0;
+ }
+ if (mod_num_sdma &&
+ /* can't exceed chip support */
+ mod_num_sdma <= dd->chip_sdma_engines &&
+ /* count must be >= vls */
+ mod_num_sdma >= num_vls)
+ num_engines = mod_num_sdma;
+
+ dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
+ dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
+ dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
+ dd->chip_sdma_mem_size);
+
+ per_sdma_credits =
+ dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE);
+
+ /* set up freeze waitqueue */
+ init_waitqueue_head(&dd->sdma_unfreeze_wq);
+ atomic_set(&dd->sdma_unfreeze_count, 0);
+
+ descq_cnt = sdma_get_descq_cnt();
+ dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
+ num_engines, descq_cnt);
+
+ /* alloc memory for array of send engines */
+ dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
+ if (!dd->per_sdma)
+ return -ENOMEM;
+
+ idle_cnt = ns_to_cclock(dd, idle_cnt);
+ if (!sdma_desct_intr)
+ sdma_desct_intr = SDMA_DESC_INTR;
+
+ /* Allocate memory for SendDMA descriptor FIFOs */
+ for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+ sde = &dd->per_sdma[this_idx];
+ sde->dd = dd;
+ sde->ppd = ppd;
+ sde->this_idx = this_idx;
+ sde->descq_cnt = descq_cnt;
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ sde->sdma_shift = ilog2(descq_cnt);
+ sde->sdma_mask = (1 << sde->sdma_shift) - 1;
+
+ /* Create a mask specifically for each interrupt source */
+ sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
+ this_idx);
+ sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
+ this_idx);
+ sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
+ this_idx);
+ /* Create a combined mask to cover all 3 interrupt sources */
+ sde->imask = sde->int_mask | sde->progress_mask |
+ sde->idle_mask;
+
+ spin_lock_init(&sde->tail_lock);
+ seqlock_init(&sde->head_lock);
+ spin_lock_init(&sde->senddmactrl_lock);
+ spin_lock_init(&sde->flushlist_lock);
+ /* insure there is always a zero bit */
+ sde->ahg_bits = 0xfffffffe00000000ULL;
+
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+
+ /* set up reference counting */
+ kref_init(&sde->state.kref);
+ init_completion(&sde->state.comp);
+
+ INIT_LIST_HEAD(&sde->flushlist);
+ INIT_LIST_HEAD(&sde->dmawait);
+
+ sde->tail_csr =
+ get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
+
+ if (idle_cnt)
+ dd->default_desc1 =
+ SDMA_DESC1_HEAD_TO_HOST_FLAG;
+ else
+ dd->default_desc1 =
+ SDMA_DESC1_INT_REQ_FLAG;
+
+ tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
+ (unsigned long)sde);
+
+ tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+ (unsigned long)sde);
+ INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
+ INIT_WORK(&sde->flush_worker, sdma_field_flush);
+
+ sde->progress_check_head = 0;
+
+ setup_timer(&sde->err_progress_check_timer,
+ sdma_err_progress_check, (unsigned long)sde);
+
+ sde->descq = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ descq_cnt * sizeof(u64[2]),
+ &sde->descq_phys,
+ GFP_KERNEL
+ );
+ if (!sde->descq)
+ goto bail;
+ sde->tx_ring =
+ kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
+ GFP_KERNEL);
+ if (!sde->tx_ring)
+ sde->tx_ring =
+ vzalloc(
+ sizeof(struct sdma_txreq *) *
+ descq_cnt);
+ if (!sde->tx_ring)
+ goto bail;
+ }
+
+ dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
+ /* Allocate memory for DMA of head registers to memory */
+ dd->sdma_heads_dma = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ dd->sdma_heads_size,
+ &dd->sdma_heads_phys,
+ GFP_KERNEL
+ );
+ if (!dd->sdma_heads_dma) {
+ dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
+ goto bail;
+ }
+
+ /* Allocate memory for pad */
+ dd->sdma_pad_dma = dma_zalloc_coherent(
+ &dd->pcidev->dev,
+ sizeof(u32),
+ &dd->sdma_pad_phys,
+ GFP_KERNEL
+ );
+ if (!dd->sdma_pad_dma) {
+ dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
+ goto bail;
+ }
+
+ /* assign each engine to different cacheline and init registers */
+ curr_head = (void *)dd->sdma_heads_dma;
+ for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+ unsigned long phys_offset;
+
+ sde = &dd->per_sdma[this_idx];
+
+ sde->head_dma = curr_head;
+ curr_head += L1_CACHE_BYTES;
+ phys_offset = (unsigned long)sde->head_dma -
+ (unsigned long)dd->sdma_heads_dma;
+ sde->head_phys = dd->sdma_heads_phys + phys_offset;
+ init_sdma_regs(sde, per_sdma_credits, idle_cnt);
+ }
+ dd->flags |= HFI1_HAS_SEND_DMA;
+ dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
+ dd->num_sdma = num_engines;
+ if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
+ goto bail;
+ dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
+ return 0;
+
+bail:
+ sdma_clean(dd, num_engines);
+ return -ENOMEM;
+}
+
+/**
+ * sdma_all_running() - called when the link goes up
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the running state.
+ */
+void sdma_all_running(struct hfi1_devdata *dd)
+{
+ struct sdma_engine *sde;
+ unsigned int i;
+
+ /* move all engines to running */
+ for (i = 0; i < dd->num_sdma; ++i) {
+ sde = &dd->per_sdma[i];
+ sdma_process_event(sde, sdma_event_e30_go_running);
+ }
+}
+
+/**
+ * sdma_all_idle() - called when the link goes down
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the idle state.
+ */
+void sdma_all_idle(struct hfi1_devdata *dd)
+{
+ struct sdma_engine *sde;
+ unsigned int i;
+
+ /* idle all engines */
+ for (i = 0; i < dd->num_sdma; ++i) {
+ sde = &dd->per_sdma[i];
+ sdma_process_event(sde, sdma_event_e70_go_idle);
+ }
+}
+
+/**
+ * sdma_start() - called to kick off state processing for all engines
+ * @dd: hfi1_devdata
+ *
+ * This routine is for kicking off the state processing for all required
+ * sdma engines. Interrupts need to be working at this point.
+ *
+ */
+void sdma_start(struct hfi1_devdata *dd)
+{
+ unsigned i;
+ struct sdma_engine *sde;
+
+ /* kick off the engines state processing */
+ for (i = 0; i < dd->num_sdma; ++i) {
+ sde = &dd->per_sdma[i];
+ sdma_process_event(sde, sdma_event_e10_go_hw_start);
+ }
+}
+
+/**
+ * sdma_exit() - used when module is removed
+ * @dd: hfi1_devdata
+ */
+void sdma_exit(struct hfi1_devdata *dd)
+{
+ unsigned this_idx;
+ struct sdma_engine *sde;
+
+ for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
+ ++this_idx) {
+ sde = &dd->per_sdma[this_idx];
+ if (!list_empty(&sde->dmawait))
+ dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
+ sde->this_idx);
+ sdma_process_event(sde, sdma_event_e00_go_hw_down);
+
+ del_timer_sync(&sde->err_progress_check_timer);
+
+ /*
+ * This waits for the state machine to exit so it is not
+ * necessary to kill the sdma_sw_clean_up_task to make sure
+ * it is not running.
+ */
+ sdma_finalput(&sde->state);
+ }
+ sdma_clean(dd, dd->num_sdma);
+}
+
+/*
+ * unmap the indicated descriptor
+ */
+static inline void sdma_unmap_desc(
+ struct hfi1_devdata *dd,
+ struct sdma_desc *descp)
+{
+ switch (sdma_mapping_type(descp)) {
+ case SDMA_MAP_SINGLE:
+ dma_unmap_single(
+ &dd->pcidev->dev,
+ sdma_mapping_addr(descp),
+ sdma_mapping_len(descp),
+ DMA_TO_DEVICE);
+ break;
+ case SDMA_MAP_PAGE:
+ dma_unmap_page(
+ &dd->pcidev->dev,
+ sdma_mapping_addr(descp),
+ sdma_mapping_len(descp),
+ DMA_TO_DEVICE);
+ break;
+ }
+}
+
+/*
+ * return the mode as indicated by the first
+ * descriptor in the tx.
+ */
+static inline u8 ahg_mode(struct sdma_txreq *tx)
+{
+ return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+ >> SDMA_DESC1_HEADER_MODE_SHIFT;
+}
+
+/**
+ * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * @dd: hfi1_devdata for unmapping
+ * @tx: tx request to clean
+ *
+ * This is used in the progress routine to clean the tx or
+ * by the ULP to toss an in-process tx build.
+ *
+ * The code can be called multiple times without issue.
+ *
+ */
+void sdma_txclean(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx)
+{
+ u16 i;
+
+ if (tx->num_desc) {
+ u8 skip = 0, mode = ahg_mode(tx);
+
+ /* unmap first */
+ sdma_unmap_desc(dd, &tx->descp[0]);
+ /* determine number of AHG descriptors to skip */
+ if (mode > SDMA_AHG_APPLY_UPDATE1)
+ skip = mode >> 1;
+ for (i = 1 + skip; i < tx->num_desc; i++)
+ sdma_unmap_desc(dd, &tx->descp[i]);
+ tx->num_desc = 0;
+ }
+ kfree(tx->coalesce_buf);
+ tx->coalesce_buf = NULL;
+ /* kmalloc'ed descp */
+ if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
+ tx->desc_limit = ARRAY_SIZE(tx->descs);
+ kfree(tx->descp);
+ }
+}
+
+static inline u16 sdma_gethead(struct sdma_engine *sde)
+{
+ struct hfi1_devdata *dd = sde->dd;
+ int use_dmahead;
+ u16 hwhead;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+retry:
+ use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
+ (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
+ hwhead = use_dmahead ?
+ (u16)le64_to_cpu(*sde->head_dma) :
+ (u16)read_sde_csr(sde, SD(HEAD));
+
+ if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
+ u16 cnt;
+ u16 swtail;
+ u16 swhead;
+ int sane;
+
+ swhead = sde->descq_head & sde->sdma_mask;
+ /* this code is really bad for cache line trading */
+ swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+ cnt = sde->descq_cnt;
+
+ if (swhead < swtail)
+ /* not wrapped */
+ sane = (hwhead >= swhead) & (hwhead <= swtail);
+ else if (swhead > swtail)
+ /* wrapped around */
+ sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+ (hwhead <= swtail);
+ else
+ /* empty */
+ sane = (hwhead == swhead);
+
+ if (unlikely(!sane)) {
+ dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
+ sde->this_idx,
+ use_dmahead ? "dma" : "kreg",
+ hwhead, swhead, swtail, cnt);
+ if (use_dmahead) {
+ /* try one more time, using csr */
+ use_dmahead = 0;
+ goto retry;
+ }
+ /* proceed as if no progress */
+ hwhead = swhead;
+ }
+ }
+ return hwhead;
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with head_lock held.
+ */
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+{
+ struct iowait *wait, *nw;
+ struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
+ unsigned i, n = 0, seq;
+ struct sdma_txreq *stx;
+ struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+ slashstrip(__FILE__), __LINE__, __func__);
+ dd_dev_err(sde->dd, "avail: %u\n", avail);
+#endif
+
+ do {
+ seq = read_seqbegin(&dev->iowait_lock);
+ if (!list_empty(&sde->dmawait)) {
+ /* at least one item */
+ write_seqlock(&dev->iowait_lock);
+ /* Harvest waiters wanting DMA descriptors */
+ list_for_each_entry_safe(
+ wait,
+ nw,
+ &sde->dmawait,
+ list) {
+ u16 num_desc = 0;
+
+ if (!wait->wakeup)
+ continue;
+ if (n == ARRAY_SIZE(waits))
+ break;
+ if (!list_empty(&wait->tx_head)) {
+ stx = list_first_entry(
+ &wait->tx_head,
+ struct sdma_txreq,
+ list);
+ num_desc = stx->num_desc;
+ }
+ if (num_desc > avail)
+ break;
+ avail -= num_desc;
+ list_del_init(&wait->list);
+ waits[n++] = wait;
+ }
+ write_sequnlock(&dev->iowait_lock);
+ break;
+ }
+ } while (read_seqretry(&dev->iowait_lock, seq));
+
+ for (i = 0; i < n; i++)
+ waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+}
+
+/* head_lock must be held */
+static void sdma_make_progress(struct sdma_engine *sde, u64 status)
+{
+ struct sdma_txreq *txp = NULL;
+ int progress = 0;
+ u16 hwhead, swhead;
+ int idle_check_done = 0;
+
+ hwhead = sdma_gethead(sde);
+
+ /* The reason for some of the complexity of this code is that
+ * not all descriptors have corresponding txps. So, we have to
+ * be able to skip over descs until we wander into the range of
+ * the next txp on the list.
+ */
+
+retry:
+ txp = get_txhead(sde);
+ swhead = sde->descq_head & sde->sdma_mask;
+ trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+ while (swhead != hwhead) {
+ /* advance head, wrap if needed */
+ swhead = ++sde->descq_head & sde->sdma_mask;
+
+ /* if now past this txp's descs, do the callback */
+ if (txp && txp->next_descq_idx == swhead) {
+ /* remove from list */
+ sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+ complete_tx(sde, txp, SDMA_TXREQ_S_OK);
+ /* see if there is another txp */
+ txp = get_txhead(sde);
+ }
+ trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+ progress++;
+ }
+
+ /*
+ * The SDMA idle interrupt is not guaranteed to be ordered with respect
+ * to updates to the the dma_head location in host memory. The head
+ * value read might not be fully up to date. If there are pending
+ * descriptors and the SDMA idle interrupt fired then read from the
+ * CSR SDMA head instead to get the latest value from the hardware.
+ * The hardware SDMA head should be read at most once in this invocation
+ * of sdma_make_progress(..) which is ensured by idle_check_done flag
+ */
+ if ((status & sde->idle_mask) && !idle_check_done) {
+ u16 swtail;
+
+ swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+ if (swtail != hwhead) {
+ hwhead = (u16)read_sde_csr(sde, SD(HEAD));
+ idle_check_done = 1;
+ goto retry;
+ }
+ }
+
+ sde->last_status = status;
+ if (progress)
+ sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+/*
+ * sdma_engine_interrupt() - interrupt handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Status is a mask of the 3 possible interrupts for this engine. It will
+ * contain bits _only_ for this SDMA engine. It will contain at least one
+ * bit, it may contain more.
+ */
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
+{
+ trace_hfi1_sdma_engine_interrupt(sde, status);
+ write_seqlock(&sde->head_lock);
+ sdma_set_desc_cnt(sde, sdma_desct_intr);
+ if (status & sde->idle_mask)
+ sde->idle_int_cnt++;
+ else if (status & sde->progress_mask)
+ sde->progress_int_cnt++;
+ else if (status & sde->int_mask)
+ sde->sdma_int_cnt++;
+ sdma_make_progress(sde, status);
+ write_sequnlock(&sde->head_lock);
+}
+
+/**
+ * sdma_engine_error() - error handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ */
+void sdma_engine_error(struct sdma_engine *sde, u64 status)
+{
+ unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
+ sde->this_idx,
+ (unsigned long long)status,
+ sdma_state_names[sde->state.current_state]);
+#endif
+ spin_lock_irqsave(&sde->tail_lock, flags);
+ write_seqlock(&sde->head_lock);
+ if (status & ALL_SDMA_ENG_HALT_ERRS)
+ __sdma_process_event(sde, sdma_event_e60_hw_halted);
+ if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
+ dd_dev_err(sde->dd,
+ "SDMA (%u) engine error: 0x%llx state %s\n",
+ sde->this_idx,
+ (unsigned long long)status,
+ sdma_state_names[sde->state.current_state]);
+ dump_sdma_state(sde);
+ }
+ write_sequnlock(&sde->head_lock);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
+{
+ u64 set_senddmactrl = 0;
+ u64 clr_senddmactrl = 0;
+ unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
+ sde->this_idx,
+ (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
+ (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
+ (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
+ (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
+#endif
+
+ if (op & SDMA_SENDCTRL_OP_ENABLE)
+ set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+ else
+ clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+
+ if (op & SDMA_SENDCTRL_OP_INTENABLE)
+ set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+ else
+ clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+
+ if (op & SDMA_SENDCTRL_OP_HALT)
+ set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+ else
+ clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+
+ spin_lock_irqsave(&sde->senddmactrl_lock, flags);
+
+ sde->p_senddmactrl |= set_senddmactrl;
+ sde->p_senddmactrl &= ~clr_senddmactrl;
+
+ if (op & SDMA_SENDCTRL_OP_CLEANUP)
+ write_sde_csr(sde, SD(CTRL),
+ sde->p_senddmactrl |
+ SD(CTRL_SDMA_CLEANUP_SMASK));
+ else
+ write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
+
+ spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ sdma_dumpstate(sde);
+#endif
+}
+
+static void sdma_setlengen(struct sdma_engine *sde)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+ /*
+ * Set SendDmaLenGen and clear-then-set the MSB of the generation
+ * count to enable generation checking and load the internal
+ * generation counter.
+ */
+ write_sde_csr(sde, SD(LEN_GEN),
+ (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT));
+ write_sde_csr(sde, SD(LEN_GEN),
+ ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) |
+ (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
+}
+
+static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
+{
+ /* Commit writes to memory and advance the tail on the chip */
+ smp_wmb(); /* see get_txhead() */
+ writeq(tail, sde->tail_csr);
+}
+
+/*
+ * This is called when changing to state s10_hw_start_up_halt_wait as
+ * a result of send buffer errors or send DMA descriptor errors.
+ */
+static void sdma_hw_start_up(struct sdma_engine *sde)
+{
+ u64 reg;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+ sdma_setlengen(sde);
+ sdma_update_tail(sde, 0); /* Set SendDmaTail */
+ *sde->head_dma = 0;
+
+ reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
+ SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
+ write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+/*
+ * set_sdma_integrity
+ *
+ * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
+ */
+static void set_sdma_integrity(struct sdma_engine *sde)
+{
+ struct hfi1_devdata *dd = sde->dd;
+ u64 reg;
+
+ if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
+ return;
+
+ reg = hfi1_pkt_base_sdma_integrity(dd);
+
+ if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
+ CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+ else
+ SET_STATIC_RATE_CONTROL_SMASK(reg);
+
+ write_sde_csr(sde, SD(CHECK_ENABLE), reg);
+}
+
+static void init_sdma_regs(
+ struct sdma_engine *sde,
+ u32 credits,
+ uint idle_cnt)
+{
+ u8 opval, opmask;
+#ifdef CONFIG_SDMA_VERBOSITY
+ struct hfi1_devdata *dd = sde->dd;
+
+ dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+ sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+ write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
+ sdma_setlengen(sde);
+ sdma_update_tail(sde, 0); /* Set SendDmaTail */
+ write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
+ write_sde_csr(sde, SD(DESC_CNT), 0);
+ write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
+ write_sde_csr(sde, SD(MEMORY),
+ ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
+ ((u64)(credits * sde->this_idx) <<
+ SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
+ write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
+ set_sdma_integrity(sde);
+ opmask = OPCODE_CHECK_MASK_DISABLED;
+ opval = OPCODE_CHECK_VAL_DISABLED;
+ write_sde_csr(sde, SD(CHECK_OPCODE),
+ (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
+ (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
+}
+
+#ifdef CONFIG_SDMA_VERBOSITY
+
+#define sdma_dumpstate_helper0(reg) do { \
+ csr = read_csr(sde->dd, reg); \
+ dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \
+ } while (0)
+
+#define sdma_dumpstate_helper(reg) do { \
+ csr = read_sde_csr(sde, reg); \
+ dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
+ #reg, sde->this_idx, csr); \
+ } while (0)
+
+#define sdma_dumpstate_helper2(reg) do { \
+ csr = read_csr(sde->dd, reg + (8 * i)); \
+ dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \
+ #reg, i, csr); \
+ } while (0)
+
+void sdma_dumpstate(struct sdma_engine *sde)
+{
+ u64 csr;
+ unsigned i;
+
+ sdma_dumpstate_helper(SD(CTRL));
+ sdma_dumpstate_helper(SD(STATUS));
+ sdma_dumpstate_helper0(SD(ERR_STATUS));
+ sdma_dumpstate_helper0(SD(ERR_MASK));
+ sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
+ sdma_dumpstate_helper(SD(ENG_ERR_MASK));
+
+ for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
+ sdma_dumpstate_helper2(CCE_INT_STATUS);
+ sdma_dumpstate_helper2(CCE_INT_MASK);
+ sdma_dumpstate_helper2(CCE_INT_BLOCKED);
+ }
+
+ sdma_dumpstate_helper(SD(TAIL));
+ sdma_dumpstate_helper(SD(HEAD));
+ sdma_dumpstate_helper(SD(PRIORITY_THLD));
+ sdma_dumpstate_helper(SD(IDLE_CNT));
+ sdma_dumpstate_helper(SD(RELOAD_CNT));
+ sdma_dumpstate_helper(SD(DESC_CNT));
+ sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
+ sdma_dumpstate_helper(SD(MEMORY));
+ sdma_dumpstate_helper0(SD(ENGINES));
+ sdma_dumpstate_helper0(SD(MEM_SIZE));
+ /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */
+ sdma_dumpstate_helper(SD(BASE_ADDR));
+ sdma_dumpstate_helper(SD(LEN_GEN));
+ sdma_dumpstate_helper(SD(HEAD_ADDR));
+ sdma_dumpstate_helper(SD(CHECK_ENABLE));
+ sdma_dumpstate_helper(SD(CHECK_VL));
+ sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
+ sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
+ sdma_dumpstate_helper(SD(CHECK_SLID));
+ sdma_dumpstate_helper(SD(CHECK_OPCODE));
+}
+#endif
+
+static void dump_sdma_state(struct sdma_engine *sde)
+{
+ struct hw_sdma_desc *descq;
+ struct hw_sdma_desc *descqp;
+ u64 desc[2];
+ u64 addr;
+ u8 gen;
+ u16 len;
+ u16 head, tail, cnt;
+
+ head = sde->descq_head & sde->sdma_mask;
+ tail = sde->descq_tail & sde->sdma_mask;
+ cnt = sdma_descq_freecnt(sde);
+ descq = sde->descq;
+
+ dd_dev_err(sde->dd,
+ "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
+ sde->this_idx, head, tail, cnt,
+ !list_empty(&sde->flushlist));
+
+ /* print info for each entry in the descriptor queue */
+ while (head != tail) {
+ char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+ descqp = &sde->descq[head];
+ desc[0] = le64_to_cpu(descqp->qw[0]);
+ desc[1] = le64_to_cpu(descqp->qw[1]);
+ flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+ flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+ 'H' : '-';
+ flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+ flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+ addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+ & SDMA_DESC0_PHY_ADDR_MASK;
+ gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+ & SDMA_DESC1_GENERATION_MASK;
+ len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+ & SDMA_DESC0_BYTE_COUNT_MASK;
+ dd_dev_err(sde->dd,
+ "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+ head, flags, addr, gen, len);
+ dd_dev_err(sde->dd,
+ "\tdesc0:0x%016llx desc1 0x%016llx\n",
+ desc[0], desc[1]);
+ if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+ dd_dev_err(sde->dd,
+ "\taidx: %u amode: %u alen: %u\n",
+ (u8)((desc[1] &
+ SDMA_DESC1_HEADER_INDEX_SMASK) >>
+ SDMA_DESC1_HEADER_INDEX_SHIFT),
+ (u8)((desc[1] &
+ SDMA_DESC1_HEADER_MODE_SMASK) >>
+ SDMA_DESC1_HEADER_MODE_SHIFT),
+ (u8)((desc[1] &
+ SDMA_DESC1_HEADER_DWS_SMASK) >>
+ SDMA_DESC1_HEADER_DWS_SHIFT));
+ head++;
+ head &= sde->sdma_mask;
+ }
+}
+
+#define SDE_FMT \
+ "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+/**
+ * sdma_seqfile_dump_sde() - debugfs dump of sde
+ * @s: seq file
+ * @sde: send dma engine to dump
+ *
+ * This routine dumps the sde to the indicated seq file.
+ */
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
+{
+ u16 head, tail;
+ struct hw_sdma_desc *descqp;
+ u64 desc[2];
+ u64 addr;
+ u8 gen;
+ u16 len;
+
+ head = sde->descq_head & sde->sdma_mask;
+ tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+ seq_printf(s, SDE_FMT, sde->this_idx,
+ sde->cpu,
+ sdma_state_name(sde->state.current_state),
+ (unsigned long long)read_sde_csr(sde, SD(CTRL)),
+ (unsigned long long)read_sde_csr(sde, SD(STATUS)),
+ (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
+ (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
+ (unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
+ (unsigned long long)le64_to_cpu(*sde->head_dma),
+ (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
+ (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
+ (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
+ (unsigned long long)sde->last_status,
+ (unsigned long long)sde->ahg_bits,
+ sde->tx_tail,
+ sde->tx_head,
+ sde->descq_tail,
+ sde->descq_head,
+ !list_empty(&sde->flushlist),
+ sde->descq_full_count,
+ (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
+
+ /* print info for each entry in the descriptor queue */
+ while (head != tail) {
+ char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+ descqp = &sde->descq[head];
+ desc[0] = le64_to_cpu(descqp->qw[0]);
+ desc[1] = le64_to_cpu(descqp->qw[1]);
+ flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+ flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+ 'H' : '-';
+ flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+ flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+ addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+ & SDMA_DESC0_PHY_ADDR_MASK;
+ gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+ & SDMA_DESC1_GENERATION_MASK;
+ len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+ & SDMA_DESC0_BYTE_COUNT_MASK;
+ seq_printf(s,
+ "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+ head, flags, addr, gen, len);
+ if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+ seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
+ (u8)((desc[1] &
+ SDMA_DESC1_HEADER_INDEX_SMASK) >>
+ SDMA_DESC1_HEADER_INDEX_SHIFT),
+ (u8)((desc[1] &
+ SDMA_DESC1_HEADER_MODE_SMASK) >>
+ SDMA_DESC1_HEADER_MODE_SHIFT));
+ head = (head + 1) & sde->sdma_mask;
+ }
+}
+
+/*
+ * add the generation number into
+ * the qw1 and return
+ */
+static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
+{
+ u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
+
+ qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
+ qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
+ << SDMA_DESC1_GENERATION_SHIFT;
+ return qw1;
+}
+
+/*
+ * This routine submits the indicated tx
+ *
+ * Space has already been guaranteed and
+ * tail side of ring is locked.
+ *
+ * The hardware tail update is done
+ * in the caller and that is facilitated
+ * by returning the new tail.
+ *
+ * There is special case logic for ahg
+ * to not add the generation number for
+ * up to 2 descriptors that follow the
+ * first descriptor.
+ *
+ */
+static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
+{
+ int i;
+ u16 tail;
+ struct sdma_desc *descp = tx->descp;
+ u8 skip = 0, mode = ahg_mode(tx);
+
+ tail = sde->descq_tail & sde->sdma_mask;
+ sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+ sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
+ trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
+ tail, &sde->descq[tail]);
+ tail = ++sde->descq_tail & sde->sdma_mask;
+ descp++;
+ if (mode > SDMA_AHG_APPLY_UPDATE1)
+ skip = mode >> 1;
+ for (i = 1; i < tx->num_desc; i++, descp++) {
+ u64 qw1;
+
+ sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+ if (skip) {
+ /* edits don't have generation */
+ qw1 = descp->qw[1];
+ skip--;
+ } else {
+ /* replace generation with real one for non-edits */
+ qw1 = add_gen(sde, descp->qw[1]);
+ }
+ sde->descq[tail].qw[1] = cpu_to_le64(qw1);
+ trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
+ tail, &sde->descq[tail]);
+ tail = ++sde->descq_tail & sde->sdma_mask;
+ }
+ tx->next_descq_idx = tail;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ tx->sn = sde->tail_sn++;
+ trace_hfi1_sdma_in_sn(sde, tx->sn);
+ WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
+#endif
+ sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
+ sde->desc_avail -= tx->num_desc;
+ return tail;
+}
+
+/*
+ * Check for progress
+ */
+static int sdma_check_progress(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx)
+{
+ int ret;
+
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ if (tx->num_desc <= sde->desc_avail)
+ return -EAGAIN;
+ /* pulse the head_lock */
+ if (wait && wait->sleep) {
+ unsigned seq;
+
+ seq = raw_seqcount_begin(
+ (const seqcount_t *)&sde->head_lock.seqcount);
+ ret = wait->sleep(sde, wait, tx, seq);
+ if (ret == -EAGAIN)
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ } else {
+ ret = -EBUSY;
+ }
+ return ret;
+}
+
+/**
+ * sdma_send_txreq() - submit a tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx: sdma_txreq to submit
+ *
+ * The call submits the tx into the ring. If a iowait structure is non-NULL
+ * the packet will be queued to the list in wait.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
+ * ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txreq(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx)
+{
+ int ret = 0;
+ u16 tail;
+ unsigned long flags;
+
+ /* user should have supplied entire packet */
+ if (unlikely(tx->tlen))
+ return -EINVAL;
+ tx->wait = wait;
+ spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+ if (unlikely(!__sdma_running(sde)))
+ goto unlock_noconn;
+ if (unlikely(tx->num_desc > sde->desc_avail))
+ goto nodesc;
+ tail = submit_tx(sde, tx);
+ if (wait)
+ iowait_sdma_inc(wait);
+ sdma_update_tail(sde, tail);
+unlock:
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+ return ret;
+unlock_noconn:
+ if (wait)
+ iowait_sdma_inc(wait);
+ tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ tx->sn = sde->tail_sn++;
+ trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+ spin_lock(&sde->flushlist_lock);
+ list_add_tail(&tx->list, &sde->flushlist);
+ spin_unlock(&sde->flushlist_lock);
+ if (wait) {
+ wait->tx_count++;
+ wait->count += tx->num_desc;
+ }
+ schedule_work(&sde->flush_worker);
+ ret = -ECOMM;
+ goto unlock;
+nodesc:
+ ret = sdma_check_progress(sde, wait, tx);
+ if (ret == -EAGAIN) {
+ ret = 0;
+ goto retry;
+ }
+ sde->descq_full_count++;
+ goto unlock;
+}
+
+/**
+ * sdma_send_txlist() - submit a list of tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx_list: list of sdma_txreqs to submit
+ *
+ * The call submits the list into the ring.
+ *
+ * If the iowait structure is non-NULL and not equal to the iowait list
+ * the unprocessed part of the list will be appended to the list in wait.
+ *
+ * In all cases, the tx_list will be updated so the head of the tx_list is
+ * the list of descriptors that have yet to be transmitted.
+ *
+ * The intent of this call is to provide a more efficient
+ * way of submitting multiple packets to SDMA while holding the tail
+ * side locking.
+ *
+ * Return:
+ * > 0 - Success (value is number of sdma_txreq's submitted),
+ * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
+ struct list_head *tx_list)
+{
+ struct sdma_txreq *tx, *tx_next;
+ int ret = 0;
+ unsigned long flags;
+ u16 tail = INVALID_TAIL;
+ int count = 0;
+
+ spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+ list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+ tx->wait = wait;
+ if (unlikely(!__sdma_running(sde)))
+ goto unlock_noconn;
+ if (unlikely(tx->num_desc > sde->desc_avail))
+ goto nodesc;
+ if (unlikely(tx->tlen)) {
+ ret = -EINVAL;
+ goto update_tail;
+ }
+ list_del_init(&tx->list);
+ tail = submit_tx(sde, tx);
+ count++;
+ if (tail != INVALID_TAIL &&
+ (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
+ sdma_update_tail(sde, tail);
+ tail = INVALID_TAIL;
+ }
+ }
+update_tail:
+ if (wait)
+ iowait_sdma_add(wait, count);
+ if (tail != INVALID_TAIL)
+ sdma_update_tail(sde, tail);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+ return ret == 0 ? count : ret;
+unlock_noconn:
+ spin_lock(&sde->flushlist_lock);
+ list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+ tx->wait = wait;
+ list_del_init(&tx->list);
+ if (wait)
+ iowait_sdma_inc(wait);
+ tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ tx->sn = sde->tail_sn++;
+ trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+ list_add_tail(&tx->list, &sde->flushlist);
+ if (wait) {
+ wait->tx_count++;
+ wait->count += tx->num_desc;
+ }
+ }
+ spin_unlock(&sde->flushlist_lock);
+ schedule_work(&sde->flush_worker);
+ ret = -ECOMM;
+ goto update_tail;
+nodesc:
+ ret = sdma_check_progress(sde, wait, tx);
+ if (ret == -EAGAIN) {
+ ret = 0;
+ goto retry;
+ }
+ sde->descq_full_count++;
+ goto update_tail;
+}
+
+static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sde->tail_lock, flags);
+ write_seqlock(&sde->head_lock);
+
+ __sdma_process_event(sde, event);
+
+ if (sde->state.current_state == sdma_state_s99_running)
+ sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+
+ write_sequnlock(&sde->head_lock);
+ spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void __sdma_process_event(struct sdma_engine *sde,
+ enum sdma_events event)
+{
+ struct sdma_state *ss = &sde->state;
+ int need_progress = 0;
+
+ /* CONFIG SDMA temporary */
+#ifdef CONFIG_SDMA_VERBOSITY
+ dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
+ sdma_state_names[ss->current_state],
+ sdma_event_names[event]);
+#endif
+
+ switch (ss->current_state) {
+ case sdma_state_s00_hw_down:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ break;
+ case sdma_event_e30_go_running:
+ /*
+ * If down, but running requested (usually result
+ * of link up, then we need to start up.
+ * This can happen when hw down is requested while
+ * bringing the link up with traffic active on
+ * 7220, e.g.
+ */
+ ss->go_s99_running = 1;
+ /* fall through and start dma engine */
+ case sdma_event_e10_go_hw_start:
+ /* This reference means the state machine is started */
+ sdma_get(&sde->state);
+ sdma_set_state(sde,
+ sdma_state_s10_hw_start_up_halt_wait);
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e40_sw_cleaned:
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s10_hw_start_up_halt_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ sdma_set_state(sde,
+ sdma_state_s15_hw_start_up_clean_wait);
+ sdma_start_hw_clean_up(sde);
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ schedule_work(&sde->err_halt_worker);
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s15_hw_start_up_clean_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ sdma_hw_start_up(sde);
+ sdma_set_state(sde, ss->go_s99_running ?
+ sdma_state_s99_running :
+ sdma_state_s20_idle);
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s20_idle:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ sdma_sw_tear_down(sde);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ sdma_set_state(sde, sdma_state_s99_running);
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+ schedule_work(&sde->err_halt_worker);
+ break;
+ case sdma_event_e70_go_idle:
+ break;
+ case sdma_event_e85_link_down:
+ /* fall through */
+ case sdma_event_e80_hw_freeze:
+ sdma_set_state(sde, sdma_state_s80_hw_freeze);
+ atomic_dec(&sde->dd->sdma_unfreeze_count);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s30_sw_clean_up_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
+ sdma_start_hw_clean_up(sde);
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s40_hw_clean_up_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ sdma_hw_start_up(sde);
+ sdma_set_state(sde, ss->go_s99_running ?
+ sdma_state_s99_running :
+ sdma_state_s20_idle);
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s50_hw_halt_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ schedule_work(&sde->err_halt_worker);
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s60_idle_halt_wait:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ schedule_work(&sde->err_halt_worker);
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s80_hw_freeze:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s82_freeze_sw_clean:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case sdma_event_e40_sw_cleaned:
+ /* notify caller this engine is done cleaning */
+ atomic_dec(&sde->dd->sdma_unfreeze_count);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ break;
+ case sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case sdma_event_e80_hw_freeze:
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ sdma_hw_start_up(sde);
+ sdma_set_state(sde, ss->go_s99_running ?
+ sdma_state_s99_running :
+ sdma_state_s20_idle);
+ break;
+ case sdma_event_e85_link_down:
+ break;
+ case sdma_event_e90_sw_halted:
+ break;
+ }
+ break;
+
+ case sdma_state_s99_running:
+ switch (event) {
+ case sdma_event_e00_go_hw_down:
+ sdma_set_state(sde, sdma_state_s00_hw_down);
+ tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+ break;
+ case sdma_event_e10_go_hw_start:
+ break;
+ case sdma_event_e15_hw_halt_done:
+ break;
+ case sdma_event_e25_hw_clean_up_done:
+ break;
+ case sdma_event_e30_go_running:
+ break;
+ case sdma_event_e40_sw_cleaned:
+ break;
+ case sdma_event_e50_hw_cleaned:
+ break;
+ case sdma_event_e60_hw_halted:
+ need_progress = 1;
+ sdma_err_progress_check_schedule(sde);
+ case sdma_event_e90_sw_halted:
+ /*
+ * SW initiated halt does not perform engines
+ * progress check
+ */
+ sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+ schedule_work(&sde->err_halt_worker);
+ break;
+ case sdma_event_e70_go_idle:
+ sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
+ break;
+ case sdma_event_e85_link_down:
+ ss->go_s99_running = 0;
+ /* fall through */
+ case sdma_event_e80_hw_freeze:
+ sdma_set_state(sde, sdma_state_s80_hw_freeze);
+ atomic_dec(&sde->dd->sdma_unfreeze_count);
+ wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+ break;
+ case sdma_event_e81_hw_frozen:
+ break;
+ case sdma_event_e82_hw_unfreeze:
+ break;
+ }
+ break;
+ }
+
+ ss->last_event = event;
+ if (need_progress)
+ sdma_make_progress(sde, 0);
+}
+
+/*
+ * _extend_sdma_tx_descs() - helper to extend txreq
+ *
+ * This is called once the initial nominal allocation
+ * of descriptors in the sdma_txreq is exhausted.
+ *
+ * The code will bump the allocation up to the max
+ * of MAX_DESC (64) descriptors. There doesn't seem
+ * much point in an interim step. The last descriptor
+ * is reserved for coalesce buffer in order to support
+ * cases where input packet has >MAX_DESC iovecs.
+ *
+ */
+static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+ int i;
+
+ /* Handle last descriptor */
+ if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
+ /* if tlen is 0, it is for padding, release last descriptor */
+ if (!tx->tlen) {
+ tx->desc_limit = MAX_DESC;
+ } else if (!tx->coalesce_buf) {
+ /* allocate coalesce buffer with space for padding */
+ tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
+ GFP_ATOMIC);
+ if (!tx->coalesce_buf)
+ goto enomem;
+ tx->coalesce_idx = 0;
+ }
+ return 0;
+ }
+
+ if (unlikely(tx->num_desc == MAX_DESC))
+ goto enomem;
+
+ tx->descp = kmalloc_array(
+ MAX_DESC,
+ sizeof(struct sdma_desc),
+ GFP_ATOMIC);
+ if (!tx->descp)
+ goto enomem;
+
+ /* reserve last descriptor for coalescing */
+ tx->desc_limit = MAX_DESC - 1;
+ /* copy ones already built */
+ for (i = 0; i < tx->num_desc; i++)
+ tx->descp[i] = tx->descs[i];
+ return 0;
+enomem:
+ sdma_txclean(dd, tx);
+ return -ENOMEM;
+}
+
+/*
+ * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
+ *
+ * This is called once the initial nominal allocation of descriptors
+ * in the sdma_txreq is exhausted.
+ *
+ * This function calls _extend_sdma_tx_descs to extend or allocate
+ * coalesce buffer. If there is a allocated coalesce buffer, it will
+ * copy the input packet data into the coalesce buffer. It also adds
+ * coalesce buffer descriptor once when whole packet is received.
+ *
+ * Return:
+ * <0 - error
+ * 0 - coalescing, don't populate descriptor
+ * 1 - continue with populating descriptor
+ */
+int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
+ int type, void *kvaddr, struct page *page,
+ unsigned long offset, u16 len)
+{
+ int pad_len, rval;
+ dma_addr_t addr;
+
+ rval = _extend_sdma_tx_descs(dd, tx);
+ if (rval) {
+ sdma_txclean(dd, tx);
+ return rval;
+ }
+
+ /* If coalesce buffer is allocated, copy data into it */
+ if (tx->coalesce_buf) {
+ if (type == SDMA_MAP_NONE) {
+ sdma_txclean(dd, tx);
+ return -EINVAL;
+ }
+
+ if (type == SDMA_MAP_PAGE) {
+ kvaddr = kmap(page);
+ kvaddr += offset;
+ } else if (WARN_ON(!kvaddr)) {
+ sdma_txclean(dd, tx);
+ return -EINVAL;
+ }
+
+ memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
+ tx->coalesce_idx += len;
+ if (type == SDMA_MAP_PAGE)
+ kunmap(page);
+
+ /* If there is more data, return */
+ if (tx->tlen - tx->coalesce_idx)
+ return 0;
+
+ /* Whole packet is received; add any padding */
+ pad_len = tx->packet_len & (sizeof(u32) - 1);
+ if (pad_len) {
+ pad_len = sizeof(u32) - pad_len;
+ memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
+ /* padding is taken care of for coalescing case */
+ tx->packet_len += pad_len;
+ tx->tlen += pad_len;
+ }
+
+ /* dma map the coalesce buffer */
+ addr = dma_map_single(&dd->pcidev->dev,
+ tx->coalesce_buf,
+ tx->tlen,
+ DMA_TO_DEVICE);
+
+ if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+ sdma_txclean(dd, tx);
+ return -ENOSPC;
+ }
+
+ /* Add descriptor for coalesce buffer */
+ tx->desc_limit = MAX_DESC;
+ return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
+ addr, tx->tlen);
+ }
+
+ return 1;
+}
+
+/* Update sdes when the lmc changes */
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
+{
+ struct sdma_engine *sde;
+ int i;
+ u64 sreg;
+
+ sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
+ SD(CHECK_SLID_MASK_SHIFT)) |
+ (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
+ SD(CHECK_SLID_VALUE_SHIFT));
+
+ for (i = 0; i < dd->num_sdma; i++) {
+ hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
+ i, (u32)sreg);
+ sde = &dd->per_sdma[i];
+ write_sde_csr(sde, SD(CHECK_SLID), sreg);
+ }
+}
+
+/* tx not dword sized - pad */
+int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+ int rval = 0;
+
+ tx->num_desc++;
+ if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ rval = _extend_sdma_tx_descs(dd, tx);
+ if (rval) {
+ sdma_txclean(dd, tx);
+ return rval;
+ }
+ }
+ /* finish the one just added */
+ make_tx_sdma_desc(
+ tx,
+ SDMA_MAP_NONE,
+ dd->sdma_pad_phys,
+ sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+ _sdma_close_tx(dd, tx);
+ return rval;
+}
+
+/*
+ * Add ahg to the sdma_txreq
+ *
+ * The logic will consume up to 3
+ * descriptors at the beginning of
+ * sdma_txreq.
+ */
+void _sdma_txreq_ahgadd(
+ struct sdma_txreq *tx,
+ u8 num_ahg,
+ u8 ahg_entry,
+ u32 *ahg,
+ u8 ahg_hlen)
+{
+ u32 i, shift = 0, desc = 0;
+ u8 mode;
+
+ WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
+ /* compute mode */
+ if (num_ahg == 1)
+ mode = SDMA_AHG_APPLY_UPDATE1;
+ else if (num_ahg <= 5)
+ mode = SDMA_AHG_APPLY_UPDATE2;
+ else
+ mode = SDMA_AHG_APPLY_UPDATE3;
+ tx->num_desc++;
+ /* initialize to consumed descriptors to zero */
+ switch (mode) {
+ case SDMA_AHG_APPLY_UPDATE3:
+ tx->num_desc++;
+ tx->descs[2].qw[0] = 0;
+ tx->descs[2].qw[1] = 0;
+ /* FALLTHROUGH */
+ case SDMA_AHG_APPLY_UPDATE2:
+ tx->num_desc++;
+ tx->descs[1].qw[0] = 0;
+ tx->descs[1].qw[1] = 0;
+ break;
+ }
+ ahg_hlen >>= 2;
+ tx->descs[0].qw[1] |=
+ (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+ << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+ (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
+ << SDMA_DESC1_HEADER_DWS_SHIFT) |
+ (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
+ << SDMA_DESC1_HEADER_MODE_SHIFT) |
+ (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
+ << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
+ for (i = 0; i < (num_ahg - 1); i++) {
+ if (!shift && !(i & 2))
+ desc++;
+ tx->descs[desc].qw[!!(i & 2)] |=
+ (((u64)ahg[i + 1])
+ << shift);
+ shift = (shift + 32) & 63;
+ }
+}
+
+/**
+ * sdma_ahg_alloc - allocate an AHG entry
+ * @sde: engine to allocate from
+ *
+ * Return:
+ * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
+ * -ENOSPC if an entry is not available
+ */
+int sdma_ahg_alloc(struct sdma_engine *sde)
+{
+ int nr;
+ int oldbit;
+
+ if (!sde) {
+ trace_hfi1_ahg_allocate(sde, -EINVAL);
+ return -EINVAL;
+ }
+ while (1) {
+ nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+ if (nr > 31) {
+ trace_hfi1_ahg_allocate(sde, -ENOSPC);
+ return -ENOSPC;
+ }
+ oldbit = test_and_set_bit(nr, &sde->ahg_bits);
+ if (!oldbit)
+ break;
+ cpu_relax();
+ }
+ trace_hfi1_ahg_allocate(sde, nr);
+ return nr;
+}
+
+/**
+ * sdma_ahg_free - free an AHG entry
+ * @sde: engine to return AHG entry
+ * @ahg_index: index to free
+ *
+ * This routine frees the indicate AHG entry.
+ */
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
+{
+ if (!sde)
+ return;
+ trace_hfi1_ahg_deallocate(sde, ahg_index);
+ if (ahg_index < 0 || ahg_index > 31)
+ return;
+ clear_bit(ahg_index, &sde->ahg_bits);
+}
+
+/*
+ * SPC freeze handling for SDMA engines. Called when the driver knows
+ * the SPC is going into a freeze but before the freeze is fully
+ * settled. Generally an error interrupt.
+ *
+ * This event will pull the engine out of running so no more entries can be
+ * added to the engine's queue.
+ */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
+{
+ int i;
+ enum sdma_events event = link_down ? sdma_event_e85_link_down :
+ sdma_event_e80_hw_freeze;
+
+ /* set up the wait but do not wait here */
+ atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+ /* tell all engines to stop running and wait */
+ for (i = 0; i < dd->num_sdma; i++)
+ sdma_process_event(&dd->per_sdma[i], event);
+
+ /* sdma_freeze() will wait for all engines to have stopped */
+}
+
+/*
+ * SPC freeze handling for SDMA engines. Called when the driver knows
+ * the SPC is fully frozen.
+ */
+void sdma_freeze(struct hfi1_devdata *dd)
+{
+ int i;
+ int ret;
+
+ /*
+ * Make sure all engines have moved out of the running state before
+ * continuing.
+ */
+ ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
+ atomic_read(&dd->sdma_unfreeze_count) <=
+ 0);
+ /* interrupted or count is negative, then unloading - just exit */
+ if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
+ return;
+
+ /* set up the count for the next wait */
+ atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+ /* tell all engines that the SPC is frozen, they can start cleaning */
+ for (i = 0; i < dd->num_sdma; i++)
+ sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
+
+ /*
+ * Wait for everyone to finish software clean before exiting. The
+ * software clean will read engine CSRs, so must be completed before
+ * the next step, which will clear the engine CSRs.
+ */
+ (void)wait_event_interruptible(dd->sdma_unfreeze_wq,
+ atomic_read(&dd->sdma_unfreeze_count) <= 0);
+ /* no need to check results - done no matter what */
+}
+
+/*
+ * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen.
+ *
+ * The SPC freeze acts like a SDMA halt and a hardware clean combined. All
+ * that is left is a software clean. We could do it after the SPC is fully
+ * frozen, but then we'd have to add another state to wait for the unfreeze.
+ * Instead, just defer the software clean until the unfreeze step.
+ */
+void sdma_unfreeze(struct hfi1_devdata *dd)
+{
+ int i;
+
+ /* tell all engines start freeze clean up */
+ for (i = 0; i < dd->num_sdma; i++)
+ sdma_process_event(&dd->per_sdma[i],
+ sdma_event_e82_hw_unfreeze);
+}
+
+/**
+ * _sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ */
+void _sdma_engine_progress_schedule(
+ struct sdma_engine *sde)
+{
+ trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
+ /* assume we have selected a good cpu */
+ write_csr(sde->dd,
+ CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
+ sde->progress_mask);
+}
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
new file mode 100644
index 000000000000..8f50c99fe711
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -0,0 +1,1082 @@
+#ifndef _HFI1_SDMA_H
+#define _HFI1_SDMA_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+
+#include "hfi.h"
+#include "verbs.h"
+#include "sdma_txreq.h"
+
+/* Hardware limit */
+#define MAX_DESC 64
+/* Hardware limit for SDMA packet size */
+#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
+
+#define SDMA_TXREQ_S_OK 0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED 2
+#define SDMA_TXREQ_S_SHUTDOWN 3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT 0x0001
+#define SDMA_TXREQ_F_AHG_COPY 0x0002
+#define SDMA_TXREQ_F_USE_AHG 0x0004
+
+#define SDMA_MAP_NONE 0
+#define SDMA_MAP_SINGLE 1
+#define SDMA_MAP_PAGE 2
+
+#define SDMA_AHG_VALUE_MASK 0xffff
+#define SDMA_AHG_VALUE_SHIFT 0
+#define SDMA_AHG_INDEX_MASK 0xf
+#define SDMA_AHG_INDEX_SHIFT 16
+#define SDMA_AHG_FIELD_LEN_MASK 0xf
+#define SDMA_AHG_FIELD_LEN_SHIFT 20
+#define SDMA_AHG_FIELD_START_MASK 0x1f
+#define SDMA_AHG_FIELD_START_SHIFT 24
+#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1
+#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
+
+/* AHG modes */
+
+/*
+ * Be aware the ordering and values
+ * for SDMA_AHG_APPLY_UPDATE[123]
+ * are assumed in generating a skip
+ * count in submit_tx() in sdma.c
+ */
+#define SDMA_AHG_NO_AHG 0
+#define SDMA_AHG_COPY 1
+#define SDMA_AHG_APPLY_UPDATE1 2
+#define SDMA_AHG_APPLY_UPDATE2 3
+#define SDMA_AHG_APPLY_UPDATE3 4
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC0_FIRST_DESC_FLAG BIT_ULL(63)
+#define SDMA_DESC0_LAST_DESC_FLAG BIT_ULL(62)
+#define SDMA_DESC0_BYTE_COUNT_SHIFT 48
+#define SDMA_DESC0_BYTE_COUNT_WIDTH 14
+#define SDMA_DESC0_BYTE_COUNT_MASK \
+ ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1)
+#define SDMA_DESC0_BYTE_COUNT_SMASK \
+ (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT)
+#define SDMA_DESC0_PHY_ADDR_SHIFT 0
+#define SDMA_DESC0_PHY_ADDR_WIDTH 48
+#define SDMA_DESC0_PHY_ADDR_MASK \
+ ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1)
+#define SDMA_DESC0_PHY_ADDR_SMASK \
+ (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT)
+
+#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
+#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
+#define SDMA_DESC1_HEADER_UPDATE1_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
+ (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT)
+#define SDMA_DESC1_HEADER_MODE_SHIFT 13
+#define SDMA_DESC1_HEADER_MODE_WIDTH 3
+#define SDMA_DESC1_HEADER_MODE_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_MODE_SMASK \
+ (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT)
+#define SDMA_DESC1_HEADER_INDEX_SHIFT 8
+#define SDMA_DESC1_HEADER_INDEX_WIDTH 5
+#define SDMA_DESC1_HEADER_INDEX_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_INDEX_SMASK \
+ (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT)
+#define SDMA_DESC1_HEADER_DWS_SHIFT 4
+#define SDMA_DESC1_HEADER_DWS_WIDTH 4
+#define SDMA_DESC1_HEADER_DWS_MASK \
+ ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1)
+#define SDMA_DESC1_HEADER_DWS_SMASK \
+ (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT)
+#define SDMA_DESC1_GENERATION_SHIFT 2
+#define SDMA_DESC1_GENERATION_WIDTH 2
+#define SDMA_DESC1_GENERATION_MASK \
+ ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1)
+#define SDMA_DESC1_GENERATION_SMASK \
+ (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT)
+#define SDMA_DESC1_INT_REQ_FLAG BIT_ULL(1)
+#define SDMA_DESC1_HEAD_TO_HOST_FLAG BIT_ULL(0)
+
+enum sdma_states {
+ sdma_state_s00_hw_down,
+ sdma_state_s10_hw_start_up_halt_wait,
+ sdma_state_s15_hw_start_up_clean_wait,
+ sdma_state_s20_idle,
+ sdma_state_s30_sw_clean_up_wait,
+ sdma_state_s40_hw_clean_up_wait,
+ sdma_state_s50_hw_halt_wait,
+ sdma_state_s60_idle_halt_wait,
+ sdma_state_s80_hw_freeze,
+ sdma_state_s82_freeze_sw_clean,
+ sdma_state_s99_running,
+};
+
+enum sdma_events {
+ sdma_event_e00_go_hw_down,
+ sdma_event_e10_go_hw_start,
+ sdma_event_e15_hw_halt_done,
+ sdma_event_e25_hw_clean_up_done,
+ sdma_event_e30_go_running,
+ sdma_event_e40_sw_cleaned,
+ sdma_event_e50_hw_cleaned,
+ sdma_event_e60_hw_halted,
+ sdma_event_e70_go_idle,
+ sdma_event_e80_hw_freeze,
+ sdma_event_e81_hw_frozen,
+ sdma_event_e82_hw_unfreeze,
+ sdma_event_e85_link_down,
+ sdma_event_e90_sw_halted,
+};
+
+struct sdma_set_state_action {
+ unsigned op_enable:1;
+ unsigned op_intenable:1;
+ unsigned op_halt:1;
+ unsigned op_cleanup:1;
+ unsigned go_s99_running_tofalse:1;
+ unsigned go_s99_running_totrue:1;
+};
+
+struct sdma_state {
+ struct kref kref;
+ struct completion comp;
+ enum sdma_states current_state;
+ unsigned current_op;
+ unsigned go_s99_running;
+ /* debugging/development */
+ enum sdma_states previous_state;
+ unsigned previous_op;
+ enum sdma_events last_event;
+};
+
+/**
+ * DOC: sdma exported routines
+ *
+ * These sdma routines fit into three categories:
+ * - The SDMA API for building and submitting packets
+ * to the ring
+ *
+ * - Initialization and tear down routines to buildup
+ * and tear down SDMA
+ *
+ * - ISR entrances to handle interrupts, state changes
+ * and errors
+ */
+
+/**
+ * DOC: sdma PSM/verbs API
+ *
+ * The sdma API is designed to be used by both PSM
+ * and verbs to supply packets to the SDMA ring.
+ *
+ * The usage of the API is as follows:
+ *
+ * Embed a struct iowait in the QP or
+ * PQ. The iowait should be initialized with a
+ * call to iowait_init().
+ *
+ * The user of the API should create an allocation method
+ * for their version of the txreq. slabs, pre-allocated lists,
+ * and dma pools can be used. Once the user's overload of
+ * the sdma_txreq has been allocated, the sdma_txreq member
+ * must be initialized with sdma_txinit() or sdma_txinit_ahg().
+ *
+ * The txreq must be declared with the sdma_txreq first.
+ *
+ * The tx request, once initialized, is manipulated with calls to
+ * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
+ * for each disjoint memory location. It is the user's responsibility
+ * to understand the packet boundaries and page boundaries to do the
+ * appropriate number of sdma_txadd_* calls.. The user
+ * must be prepared to deal with failures from these routines due to
+ * either memory allocation or dma_mapping failures.
+ *
+ * The mapping specifics for each memory location are recorded
+ * in the tx. Memory locations added with sdma_txadd_page()
+ * and sdma_txadd_kvaddr() are automatically mapped when added
+ * to the tx and nmapped as part of the progress processing in the
+ * SDMA interrupt handling.
+ *
+ * sdma_txadd_daddr() is used to add an dma_addr_t memory to the
+ * tx. An example of a use case would be a pre-allocated
+ * set of headers allocated via dma_pool_alloc() or
+ * dma_alloc_coherent(). For these memory locations, it
+ * is the responsibility of the user to handle that unmapping.
+ * (This would usually be at an unload or job termination.)
+ *
+ * The routine sdma_send_txreq() is used to submit
+ * a tx to the ring after the appropriate number of
+ * sdma_txadd_* have been done.
+ *
+ * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
+ * can be used to submit a list of packets.
+ *
+ * The user is free to use the link overhead in the struct sdma_txreq as
+ * long as the tx isn't in flight.
+ *
+ * The extreme degenerate case of the number of descriptors
+ * exceeding the ring size is automatically handled as
+ * memory locations are added. An overflow of the descriptor
+ * array that is part of the sdma_txreq is also automatically
+ * handled.
+ *
+ */
+
+/**
+ * DOC: Infrastructure calls
+ *
+ * sdma_init() is used to initialize data structures and
+ * CSRs for the desired number of SDMA engines.
+ *
+ * sdma_start() is used to kick the SDMA engines initialized
+ * with sdma_init(). Interrupts must be enabled at this
+ * point since aspects of the state machine are interrupt
+ * driven.
+ *
+ * sdma_engine_error() and sdma_engine_interrupt() are
+ * entrances for interrupts.
+ *
+ * sdma_map_init() is for the management of the mapping
+ * table when the number of vls is changed.
+ *
+ */
+
+/*
+ * struct hw_sdma_desc - raw 128 bit SDMA descriptor
+ *
+ * This is the raw descriptor in the SDMA ring
+ */
+struct hw_sdma_desc {
+ /* private: don't use directly */
+ __le64 qw[2];
+};
+
+/**
+ * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data
+ * @ppd: per port back-pointer
+ * @imask: mask for irq manipulation
+ * @idle_mask: mask for determining if an interrupt is due to sdma_idle
+ *
+ * This structure has the state for each sdma_engine.
+ *
+ * Accessing to non public fields are not supported
+ * since the private members are subject to change.
+ */
+struct sdma_engine {
+ /* read mostly */
+ struct hfi1_devdata *dd;
+ struct hfi1_pportdata *ppd;
+ /* private: */
+ void __iomem *tail_csr;
+ u64 imask; /* clear interrupt mask */
+ u64 idle_mask;
+ u64 progress_mask;
+ u64 int_mask;
+ /* private: */
+ volatile __le64 *head_dma; /* DMA'ed by chip */
+ /* private: */
+ dma_addr_t head_phys;
+ /* private: */
+ struct hw_sdma_desc *descq;
+ /* private: */
+ unsigned descq_full_count;
+ struct sdma_txreq **tx_ring;
+ /* private: */
+ dma_addr_t descq_phys;
+ /* private */
+ u32 sdma_mask;
+ /* private */
+ struct sdma_state state;
+ /* private */
+ int cpu;
+ /* private: */
+ u8 sdma_shift;
+ /* private: */
+ u8 this_idx; /* zero relative engine */
+ /* protect changes to senddmactrl shadow */
+ spinlock_t senddmactrl_lock;
+ /* private: */
+ u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */
+
+ /* read/write using tail_lock */
+ spinlock_t tail_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ /* private: */
+ u64 tail_sn;
+#endif
+ /* private: */
+ u32 descq_tail;
+ /* private: */
+ unsigned long ahg_bits;
+ /* private: */
+ u16 desc_avail;
+ /* private: */
+ u16 tx_tail;
+ /* private: */
+ u16 descq_cnt;
+
+ /* read/write using head_lock */
+ /* private: */
+ seqlock_t head_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ /* private: */
+ u64 head_sn;
+#endif
+ /* private: */
+ u32 descq_head;
+ /* private: */
+ u16 tx_head;
+ /* private: */
+ u64 last_status;
+ /* private */
+ u64 err_cnt;
+ /* private */
+ u64 sdma_int_cnt;
+ u64 idle_int_cnt;
+ u64 progress_int_cnt;
+
+ /* private: */
+ struct list_head dmawait;
+
+ /* CONFIG SDMA for now, just blindly duplicate */
+ /* private: */
+ struct tasklet_struct sdma_hw_clean_up_task
+ ____cacheline_aligned_in_smp;
+
+ /* private: */
+ struct tasklet_struct sdma_sw_clean_up_task
+ ____cacheline_aligned_in_smp;
+ /* private: */
+ struct work_struct err_halt_worker;
+ /* private */
+ struct timer_list err_progress_check_timer;
+ u32 progress_check_head;
+ /* private: */
+ struct work_struct flush_worker;
+ /* protect flush list */
+ spinlock_t flushlist_lock;
+ /* private: */
+ struct list_head flushlist;
+};
+
+int sdma_init(struct hfi1_devdata *dd, u8 port);
+void sdma_start(struct hfi1_devdata *dd);
+void sdma_exit(struct hfi1_devdata *dd);
+void sdma_all_running(struct hfi1_devdata *dd);
+void sdma_all_idle(struct hfi1_devdata *dd);
+void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
+void sdma_freeze(struct hfi1_devdata *dd);
+void sdma_unfreeze(struct hfi1_devdata *dd);
+void sdma_wait(struct hfi1_devdata *dd);
+
+/**
+ * sdma_empty() - idle engine test
+ * @engine: sdma engine
+ *
+ * Currently used by verbs as a latency optimization.
+ *
+ * Return:
+ * 1 - empty, 0 - non-empty
+ */
+static inline int sdma_empty(struct sdma_engine *sde)
+{
+ return sde->descq_tail == sde->descq_head;
+}
+
+static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
+{
+ return sde->descq_cnt -
+ (sde->descq_tail -
+ ACCESS_ONCE(sde->descq_head)) - 1;
+}
+
+static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
+{
+ return sde->descq_cnt - sdma_descq_freecnt(sde);
+}
+
+/*
+ * Either head_lock or tail lock required to see
+ * a steady state.
+ */
+static inline int __sdma_running(struct sdma_engine *engine)
+{
+ return engine->state.current_state == sdma_state_s99_running;
+}
+
+/**
+ * sdma_running() - state suitability test
+ * @engine: sdma engine
+ *
+ * sdma_running probes the internal state to determine if it is suitable
+ * for submitting packets.
+ *
+ * Return:
+ * 1 - ok to submit, 0 - not ok to submit
+ *
+ */
+static inline int sdma_running(struct sdma_engine *engine)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&engine->tail_lock, flags);
+ ret = __sdma_running(engine);
+ spin_unlock_irqrestore(&engine->tail_lock, flags);
+ return ret;
+}
+
+void _sdma_txreq_ahgadd(
+ struct sdma_txreq *tx,
+ u8 num_ahg,
+ u8 ahg_entry,
+ u32 *ahg,
+ u8 ahg_hlen);
+
+/**
+ * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @ahg_entry: ahg entry to use (0 - 31)
+ * @num_ahg: ahg descriptor for first descriptor (0 - 9)
+ * @ahg: array of AHG descriptors (up to 9 entries)
+ * @ahg_hlen: number of bytes from ASIC entry to use
+ * @cb: callback
+ *
+ * The allocation of the sdma_txreq and it enclosing structure is user
+ * dependent. This routine must be called to initialize the user independent
+ * fields.
+ *
+ * The currently supported flags are SDMA_TXREQ_F_URGENT,
+ * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
+ * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
+ * the AHG descriptors into the first 1 to 3 descriptors.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg(). The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used. Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted. The callback will be provided this tx, a status, and a flag.
+ *
+ * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ * The flag, if the is the iowait had been used, indicates the iowait
+ * sdma_busy count has reached zero.
+ *
+ * user data portion of tlen should be precise. The sdma_txadd_* entrances
+ * will pad with a descriptor references 1 - 3 bytes when the number of bytes
+ * specified in tlen have been supplied to the sdma_txreq.
+ *
+ * ahg_hlen is used to determine the number of on-chip entry bytes to
+ * use as the header. This is for cases where the stored header is
+ * larger than the header to be used in a packet. This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
+ * and RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+ struct sdma_txreq *tx,
+ u16 flags,
+ u16 tlen,
+ u8 ahg_entry,
+ u8 num_ahg,
+ u32 *ahg,
+ u8 ahg_hlen,
+ void (*cb)(struct sdma_txreq *, int))
+{
+ if (tlen == 0)
+ return -ENODATA;
+ if (tlen > MAX_SDMA_PKT_SIZE)
+ return -EMSGSIZE;
+ tx->desc_limit = ARRAY_SIZE(tx->descs);
+ tx->descp = &tx->descs[0];
+ INIT_LIST_HEAD(&tx->list);
+ tx->num_desc = 0;
+ tx->flags = flags;
+ tx->complete = cb;
+ tx->coalesce_buf = NULL;
+ tx->wait = NULL;
+ tx->packet_len = tlen;
+ tx->tlen = tx->packet_len;
+ tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+ tx->descs[0].qw[1] = 0;
+ if (flags & SDMA_TXREQ_F_AHG_COPY)
+ tx->descs[0].qw[1] |=
+ (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+ << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+ (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+ << SDMA_DESC1_HEADER_MODE_SHIFT);
+ else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+ _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+ return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and it enclosing structure is user
+ * dependent. This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The currently supported flags is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be gotten on selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg(). The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used. The head size of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL, will be provided this tx and a status. The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ */
+static inline int sdma_txinit(
+ struct sdma_txreq *tx,
+ u16 flags,
+ u16 tlen,
+ void (*cb)(struct sdma_txreq *, int))
+{
+ return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
+}
+
+/* helpers - don't use */
+static inline int sdma_mapping_type(struct sdma_desc *d)
+{
+ return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
+ >> SDMA_DESC1_GENERATION_SHIFT;
+}
+
+static inline size_t sdma_mapping_len(struct sdma_desc *d)
+{
+ return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
+ >> SDMA_DESC0_BYTE_COUNT_SHIFT;
+}
+
+static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
+{
+ return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
+ >> SDMA_DESC0_PHY_ADDR_SHIFT;
+}
+
+static inline void make_tx_sdma_desc(
+ struct sdma_txreq *tx,
+ int type,
+ dma_addr_t addr,
+ size_t len)
+{
+ struct sdma_desc *desc = &tx->descp[tx->num_desc];
+
+ if (!tx->num_desc) {
+ /* qw[0] zero; qw[1] first, ahg mode already in from init */
+ desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
+ << SDMA_DESC1_GENERATION_SHIFT;
+ } else {
+ desc->qw[0] = 0;
+ desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
+ << SDMA_DESC1_GENERATION_SHIFT;
+ }
+ desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
+ << SDMA_DESC0_PHY_ADDR_SHIFT) |
+ (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
+ << SDMA_DESC0_BYTE_COUNT_SHIFT);
+}
+
+/* helper to extend txreq */
+int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
+ int type, void *kvaddr, struct page *page,
+ unsigned long offset, u16 len);
+int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+/* helpers used by public routines */
+static inline void _sdma_close_tx(struct hfi1_devdata *dd,
+ struct sdma_txreq *tx)
+{
+ tx->descp[tx->num_desc].qw[0] |=
+ SDMA_DESC0_LAST_DESC_FLAG;
+ tx->descp[tx->num_desc].qw[1] |=
+ dd->default_desc1;
+ if (tx->flags & SDMA_TXREQ_F_URGENT)
+ tx->descp[tx->num_desc].qw[1] |=
+ (SDMA_DESC1_HEAD_TO_HOST_FLAG |
+ SDMA_DESC1_INT_REQ_FLAG);
+}
+
+static inline int _sdma_txadd_daddr(
+ struct hfi1_devdata *dd,
+ int type,
+ struct sdma_txreq *tx,
+ dma_addr_t addr,
+ u16 len)
+{
+ int rval = 0;
+
+ make_tx_sdma_desc(
+ tx,
+ type,
+ addr, len);
+ WARN_ON(len > tx->tlen);
+ tx->tlen -= len;
+ /* special cases for last */
+ if (!tx->tlen) {
+ if (tx->packet_len & (sizeof(u32) - 1)) {
+ rval = _pad_sdma_tx_descs(dd, tx);
+ if (rval)
+ return rval;
+ } else {
+ _sdma_close_tx(dd, tx);
+ }
+ }
+ tx->num_desc++;
+ return rval;
+}
+
+/**
+ * sdma_txadd_page() - add a page to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: tx request to which the page is added
+ * @page: page to map
+ * @offset: offset within the page
+ * @len: length in bytes
+ *
+ * This is used to add a page/offset/length descriptor.
+ *
+ * The mapping/unmapping of the page/offset/len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
+ * extend/coalesce descriptor array
+ */
+static inline int sdma_txadd_page(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx,
+ struct page *page,
+ unsigned long offset,
+ u16 len)
+{
+ dma_addr_t addr;
+ int rval;
+
+ if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE,
+ NULL, page, offset, len);
+ if (rval <= 0)
+ return rval;
+ }
+
+ addr = dma_map_page(
+ &dd->pcidev->dev,
+ page,
+ offset,
+ len,
+ DMA_TO_DEVICE);
+
+ if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+ sdma_txclean(dd, tx);
+ return -ENOSPC;
+ }
+
+ return _sdma_txadd_daddr(
+ dd, SDMA_MAP_PAGE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_daddr() - add a dma address to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @addr: dma address mapped by caller
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor for memory that is already dma mapped.
+ *
+ * In this case, there is no unmapping as part of the progress processing for
+ * this memory location.
+ *
+ * Return:
+ * 0 - success, -ENOMEM - couldn't extend descriptor array
+ */
+
+static inline int sdma_txadd_daddr(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx,
+ dma_addr_t addr,
+ u16 len)
+{
+ int rval;
+
+ if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE,
+ NULL, NULL, 0, 0);
+ if (rval <= 0)
+ return rval;
+ }
+
+ return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @kvaddr: the kernel virtual address
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor referenced by the indicated kvaddr and
+ * len.
+ *
+ * The mapping/unmapping of the kvaddr and len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce
+ * descriptor array
+ */
+static inline int sdma_txadd_kvaddr(
+ struct hfi1_devdata *dd,
+ struct sdma_txreq *tx,
+ void *kvaddr,
+ u16 len)
+{
+ dma_addr_t addr;
+ int rval;
+
+ if ((unlikely(tx->num_desc == tx->desc_limit))) {
+ rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE,
+ kvaddr, NULL, 0, len);
+ if (rval <= 0)
+ return rval;
+ }
+
+ addr = dma_map_single(
+ &dd->pcidev->dev,
+ kvaddr,
+ len,
+ DMA_TO_DEVICE);
+
+ if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+ sdma_txclean(dd, tx);
+ return -ENOSPC;
+ }
+
+ return _sdma_txadd_daddr(
+ dd, SDMA_MAP_SINGLE, tx, addr, len);
+}
+
+struct iowait;
+
+int sdma_send_txreq(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *tx);
+int sdma_send_txlist(struct sdma_engine *sde,
+ struct iowait *wait,
+ struct list_head *tx_list);
+
+int sdma_ahg_alloc(struct sdma_engine *sde);
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
+
+/**
+ * sdma_build_ahg - build ahg descriptor
+ * @data
+ * @dwindex
+ * @startbit
+ * @bits
+ *
+ * Build and return a 32 bit descriptor.
+ */
+static inline u32 sdma_build_ahg_descriptor(
+ u16 data,
+ u8 dwindex,
+ u8 startbit,
+ u8 bits)
+{
+ return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
+ ((startbit & SDMA_AHG_FIELD_START_MASK) <<
+ SDMA_AHG_FIELD_START_SHIFT) |
+ ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
+ SDMA_AHG_FIELD_LEN_SHIFT) |
+ ((dwindex & SDMA_AHG_INDEX_MASK) <<
+ SDMA_AHG_INDEX_SHIFT) |
+ ((data & SDMA_AHG_VALUE_MASK) <<
+ SDMA_AHG_VALUE_SHIFT));
+}
+
+/**
+ * sdma_progress - use seq number of detect head progress
+ * @sde: sdma_engine to check
+ * @seq: base seq count
+ * @tx: txreq for which we need to check descriptor availability
+ *
+ * This is used in the appropriate spot in the sleep routine
+ * to check for potential ring progress. This routine gets the
+ * seqcount before queuing the iowait structure for progress.
+ *
+ * If the seqcount indicates that progress needs to be checked,
+ * re-submission is detected by checking whether the descriptor
+ * queue has enough descriptor for the txreq.
+ */
+static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
+ struct sdma_txreq *tx)
+{
+ if (read_seqretry(&sde->head_lock, seq)) {
+ sde->desc_avail = sdma_descq_freecnt(sde);
+ if (tx->num_desc > sde->desc_avail)
+ return 0;
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * sdma_iowait_schedule() - initialize wait structure
+ * @sde: sdma_engine to schedule
+ * @wait: wait struct to schedule
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+static inline void sdma_iowait_schedule(
+ struct sdma_engine *sde,
+ struct iowait *wait)
+{
+ struct hfi1_pportdata *ppd = sde->dd->pport;
+
+ iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
+}
+
+/* for use by interrupt handling */
+void sdma_engine_error(struct sdma_engine *sde, u64 status);
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
+
+/*
+ *
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform engines per vl, the
+ * number of engines for a vl is either the vl_engines[vl] or
+ * a computation based on num_sdma/num_vls:
+ *
+ * For example:
+ * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where there are num_sdma/num_vls doesn't divide
+ * evenly, the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the engines are assigned
+ * in a round robin fashion wrapping back to the first engine
+ * for a particular vl.
+ *
+ * dd->sdma_map
+ * | sdma_map_elem[0]
+ * | +--------------------+
+ * v | mask |
+ * sdma_vl_map |--------------------|
+ * +--------------------------+ | sde[0] -> eng 1 |
+ * | list (RCU) | |--------------------|
+ * |--------------------------| ->| sde[1] -> eng 2 |
+ * | mask | --/ |--------------------|
+ * |--------------------------| -/ | * |
+ * | actual_vls (max 8) | -/ |--------------------|
+ * |--------------------------| --/ | sde[n] -> eng n |
+ * | vls (max 8) | -/ +--------------------+
+ * |--------------------------| --/
+ * | map[0] |-/
+ * |--------------------------| +--------------------+
+ * | map[1] |--- | mask |
+ * |--------------------------| \---- |--------------------|
+ * | * | \-- | sde[0] -> eng 1+n |
+ * | * | \---- |--------------------|
+ * | * | \->| sde[1] -> eng 2+n |
+ * |--------------------------| |--------------------|
+ * | map[vls - 1] |- | * |
+ * +--------------------------+ \- |--------------------|
+ * \- | sde[m] -> eng m+n |
+ * \ +--------------------+
+ * \-
+ * \
+ * \- +--------------------+
+ * \- | mask |
+ * \ |--------------------|
+ * \- | sde[0] -> eng 1+m+n|
+ * \- |--------------------|
+ * >| sde[1] -> eng 2+m+n|
+ * |--------------------|
+ * | * |
+ * |--------------------|
+ * | sde[o] -> eng o+m+n|
+ * +--------------------+
+ *
+ */
+
+/**
+ * struct sdma_map_elem - mapping for a vl
+ * @mask - selector mask
+ * @sde - array of engines for this vl
+ *
+ * The mask is used to "mod" the selector
+ * to produce index into the trailing
+ * array of sdes.
+ */
+struct sdma_map_elem {
+ u32 mask;
+ struct sdma_engine *sde[0];
+};
+
+/**
+ * struct sdma_map_el - mapping for a vl
+ * @engine_to_vl - map of an engine to a vl
+ * @list - rcu head for free callback
+ * @mask - vl mask to "mod" the vl to produce an index to map array
+ * @actual_vls - number of vls
+ * @vls - number of vls rounded to next power of 2
+ * @map - array of sdma_map_elem entries
+ *
+ * This is the parent mapping structure. The trailing
+ * members of the struct point to sdma_map_elem entries, which
+ * in turn point to an array of sde's for that vl.
+ */
+struct sdma_vl_map {
+ s8 engine_to_vl[TXE_NUM_SDMA_ENGINES];
+ struct rcu_head list;
+ u32 mask;
+ u8 actual_vls;
+ u8 vls;
+ struct sdma_map_elem *map[0];
+};
+
+int sdma_map_init(
+ struct hfi1_devdata *dd,
+ u8 port,
+ u8 num_vls,
+ u8 *vl_engines);
+
+/* slow path */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde);
+
+/**
+ * sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * This is the fast path.
+ *
+ */
+static inline void sdma_engine_progress_schedule(
+ struct sdma_engine *sde)
+{
+ if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
+ return;
+ _sdma_engine_progress_schedule(sde);
+}
+
+struct sdma_engine *sdma_select_engine_sc(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 sc5);
+
+struct sdma_engine *sdma_select_engine_vl(
+ struct hfi1_devdata *dd,
+ u32 selector,
+ u8 vl);
+
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+void sdma_dumpstate(struct sdma_engine *);
+#endif
+static inline char *slashstrip(char *s)
+{
+ char *r = s;
+
+ while (*s)
+ if (*s++ == '/')
+ r = s;
+ return r;
+}
+
+u16 sdma_get_descq_cnt(void);
+
+extern uint mod_num_sdma;
+
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
new file mode 100644
index 000000000000..bf7d777d756e
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_SDMA_TXREQ_H
+#define HFI1_SDMA_TXREQ_H
+
+/* increased for AHG */
+#define NUM_DESC 6
+
+/*
+ * struct sdma_desc - canonical fragment descriptor
+ *
+ * This is the descriptor carried in the tx request
+ * corresponding to each fragment.
+ *
+ */
+struct sdma_desc {
+ /* private: don't use directly */
+ u64 qw[2];
+};
+
+/**
+ * struct sdma_txreq - the sdma_txreq structure (one per packet)
+ * @list: for use by user and by queuing for wait
+ *
+ * This is the representation of a packet which consists of some
+ * number of fragments. Storage is provided to within the structure.
+ * for all fragments.
+ *
+ * The storage for the descriptors are automatically extended as needed
+ * when the currently allocation is exceeded.
+ *
+ * The user (Verbs or PSM) may overload this structure with fields
+ * specific to their use by putting this struct first in their struct.
+ * The method of allocation of the overloaded structure is user dependent
+ *
+ * The list is the only public field in the structure.
+ *
+ */
+
+#define SDMA_TXREQ_S_OK 0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED 2
+#define SDMA_TXREQ_S_SHUTDOWN 3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT 0x0001
+#define SDMA_TXREQ_F_AHG_COPY 0x0002
+#define SDMA_TXREQ_F_USE_AHG 0x0004
+
+struct sdma_txreq;
+typedef void (*callback_t)(struct sdma_txreq *, int);
+
+struct iowait;
+struct sdma_txreq {
+ struct list_head list;
+ /* private: */
+ struct sdma_desc *descp;
+ /* private: */
+ void *coalesce_buf;
+ /* private: */
+ struct iowait *wait;
+ /* private: */
+ callback_t complete;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+ u64 sn;
+#endif
+ /* private: - used in coalesce/pad processing */
+ u16 packet_len;
+ /* private: - down-counted to trigger last */
+ u16 tlen;
+ /* private: */
+ u16 num_desc;
+ /* private: */
+ u16 desc_limit;
+ /* private: */
+ u16 next_descq_idx;
+ /* private: */
+ u16 coalesce_idx;
+ /* private: flags */
+ u16 flags;
+ /* private: */
+ struct sdma_desc descs[NUM_DESC];
+};
+
+static inline int sdma_txreq_built(struct sdma_txreq *tx)
+{
+ return tx->num_desc;
+}
+
+#endif /* HFI1_SDMA_TXREQ_H */
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
new file mode 100644
index 000000000000..74c84c655f7e
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -0,0 +1,810 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/ctype.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+#include "affinity.h"
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ int ret;
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+ struct cc_state *cc_state;
+
+ ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+ + sizeof(__be16);
+
+ if (pos > ret)
+ return -EINVAL;
+
+ if (count > ret - pos)
+ count = ret - pos;
+
+ if (!count)
+ return count;
+
+ rcu_read_lock();
+ cc_state = get_cc_state(ppd);
+ if (!cc_state) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ memcpy(buf, (void *)&cc_state->cct + pos, count);
+ rcu_read_unlock();
+
+ return count;
+}
+
+static void port_release(struct kobject *kobj)
+{
+ /* nothing to do since memory is freed by hfi1_free_devdata() */
+}
+
+static struct bin_attribute cc_table_bin_attr = {
+ .attr = {.name = "cc_table_bin", .mode = 0444},
+ .read = read_cc_table_bin,
+ .size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ int ret;
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+ struct cc_state *cc_state;
+
+ ret = sizeof(struct opa_congestion_setting_attr_shadow);
+
+ if (pos > ret)
+ return -EINVAL;
+ if (count > ret - pos)
+ count = ret - pos;
+
+ if (!count)
+ return count;
+
+ rcu_read_lock();
+ cc_state = get_cc_state(ppd);
+ if (!cc_state) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
+ rcu_read_unlock();
+
+ return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+ .attr = {.name = "cc_settings_bin", .mode = 0444},
+ .read = read_cc_setting_bin,
+ .size = PAGE_SIZE,
+};
+
+struct hfi1_port_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct hfi1_pportdata *, char *);
+ ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t);
+};
+
+static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf)
+{
+ return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off");
+}
+
+static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf,
+ size_t count)
+{
+ if (!memcmp(buf, "on", 2))
+ ppd->cc_prescan = true;
+ else if (!memcmp(buf, "off", 3))
+ ppd->cc_prescan = false;
+
+ return count;
+}
+
+static struct hfi1_port_attr cc_prescan_attr =
+ __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store);
+
+static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_port_attr *port_attr =
+ container_of(attr, struct hfi1_port_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+
+ return port_attr->show(ppd, buf);
+}
+
+static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hfi1_port_attr *port_attr =
+ container_of(attr, struct hfi1_port_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+
+ return port_attr->store(ppd, buf, count);
+}
+
+static const struct sysfs_ops port_cc_sysfs_ops = {
+ .show = cc_attr_show,
+ .store = cc_attr_store
+};
+
+static struct attribute *port_cc_default_attributes[] = {
+ &cc_prescan_attr.attr
+};
+
+static struct kobj_type port_cc_ktype = {
+ .release = port_release,
+ .sysfs_ops = &port_cc_sysfs_ops,
+ .default_attrs = port_cc_default_attributes
+};
+
+/* Start sc2vl */
+#define HFI1_SC2VL_ATTR(N) \
+ static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .sc = N \
+ }
+
+struct hfi1_sc2vl_attr {
+ struct attribute attr;
+ int sc;
+};
+
+HFI1_SC2VL_ATTR(0);
+HFI1_SC2VL_ATTR(1);
+HFI1_SC2VL_ATTR(2);
+HFI1_SC2VL_ATTR(3);
+HFI1_SC2VL_ATTR(4);
+HFI1_SC2VL_ATTR(5);
+HFI1_SC2VL_ATTR(6);
+HFI1_SC2VL_ATTR(7);
+HFI1_SC2VL_ATTR(8);
+HFI1_SC2VL_ATTR(9);
+HFI1_SC2VL_ATTR(10);
+HFI1_SC2VL_ATTR(11);
+HFI1_SC2VL_ATTR(12);
+HFI1_SC2VL_ATTR(13);
+HFI1_SC2VL_ATTR(14);
+HFI1_SC2VL_ATTR(15);
+HFI1_SC2VL_ATTR(16);
+HFI1_SC2VL_ATTR(17);
+HFI1_SC2VL_ATTR(18);
+HFI1_SC2VL_ATTR(19);
+HFI1_SC2VL_ATTR(20);
+HFI1_SC2VL_ATTR(21);
+HFI1_SC2VL_ATTR(22);
+HFI1_SC2VL_ATTR(23);
+HFI1_SC2VL_ATTR(24);
+HFI1_SC2VL_ATTR(25);
+HFI1_SC2VL_ATTR(26);
+HFI1_SC2VL_ATTR(27);
+HFI1_SC2VL_ATTR(28);
+HFI1_SC2VL_ATTR(29);
+HFI1_SC2VL_ATTR(30);
+HFI1_SC2VL_ATTR(31);
+
+static struct attribute *sc2vl_default_attributes[] = {
+ &hfi1_sc2vl_attr_0.attr,
+ &hfi1_sc2vl_attr_1.attr,
+ &hfi1_sc2vl_attr_2.attr,
+ &hfi1_sc2vl_attr_3.attr,
+ &hfi1_sc2vl_attr_4.attr,
+ &hfi1_sc2vl_attr_5.attr,
+ &hfi1_sc2vl_attr_6.attr,
+ &hfi1_sc2vl_attr_7.attr,
+ &hfi1_sc2vl_attr_8.attr,
+ &hfi1_sc2vl_attr_9.attr,
+ &hfi1_sc2vl_attr_10.attr,
+ &hfi1_sc2vl_attr_11.attr,
+ &hfi1_sc2vl_attr_12.attr,
+ &hfi1_sc2vl_attr_13.attr,
+ &hfi1_sc2vl_attr_14.attr,
+ &hfi1_sc2vl_attr_15.attr,
+ &hfi1_sc2vl_attr_16.attr,
+ &hfi1_sc2vl_attr_17.attr,
+ &hfi1_sc2vl_attr_18.attr,
+ &hfi1_sc2vl_attr_19.attr,
+ &hfi1_sc2vl_attr_20.attr,
+ &hfi1_sc2vl_attr_21.attr,
+ &hfi1_sc2vl_attr_22.attr,
+ &hfi1_sc2vl_attr_23.attr,
+ &hfi1_sc2vl_attr_24.attr,
+ &hfi1_sc2vl_attr_25.attr,
+ &hfi1_sc2vl_attr_26.attr,
+ &hfi1_sc2vl_attr_27.attr,
+ &hfi1_sc2vl_attr_28.attr,
+ &hfi1_sc2vl_attr_29.attr,
+ &hfi1_sc2vl_attr_30.attr,
+ &hfi1_sc2vl_attr_31.attr,
+ NULL
+};
+
+static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_sc2vl_attr *sattr =
+ container_of(attr, struct hfi1_sc2vl_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
+}
+
+static const struct sysfs_ops hfi1_sc2vl_ops = {
+ .show = sc2vl_attr_show,
+};
+
+static struct kobj_type hfi1_sc2vl_ktype = {
+ .release = port_release,
+ .sysfs_ops = &hfi1_sc2vl_ops,
+ .default_attrs = sc2vl_default_attributes
+};
+
+/* End sc2vl */
+
+/* Start sl2sc */
+#define HFI1_SL2SC_ATTR(N) \
+ static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .sl = N \
+ }
+
+struct hfi1_sl2sc_attr {
+ struct attribute attr;
+ int sl;
+};
+
+HFI1_SL2SC_ATTR(0);
+HFI1_SL2SC_ATTR(1);
+HFI1_SL2SC_ATTR(2);
+HFI1_SL2SC_ATTR(3);
+HFI1_SL2SC_ATTR(4);
+HFI1_SL2SC_ATTR(5);
+HFI1_SL2SC_ATTR(6);
+HFI1_SL2SC_ATTR(7);
+HFI1_SL2SC_ATTR(8);
+HFI1_SL2SC_ATTR(9);
+HFI1_SL2SC_ATTR(10);
+HFI1_SL2SC_ATTR(11);
+HFI1_SL2SC_ATTR(12);
+HFI1_SL2SC_ATTR(13);
+HFI1_SL2SC_ATTR(14);
+HFI1_SL2SC_ATTR(15);
+HFI1_SL2SC_ATTR(16);
+HFI1_SL2SC_ATTR(17);
+HFI1_SL2SC_ATTR(18);
+HFI1_SL2SC_ATTR(19);
+HFI1_SL2SC_ATTR(20);
+HFI1_SL2SC_ATTR(21);
+HFI1_SL2SC_ATTR(22);
+HFI1_SL2SC_ATTR(23);
+HFI1_SL2SC_ATTR(24);
+HFI1_SL2SC_ATTR(25);
+HFI1_SL2SC_ATTR(26);
+HFI1_SL2SC_ATTR(27);
+HFI1_SL2SC_ATTR(28);
+HFI1_SL2SC_ATTR(29);
+HFI1_SL2SC_ATTR(30);
+HFI1_SL2SC_ATTR(31);
+
+static struct attribute *sl2sc_default_attributes[] = {
+ &hfi1_sl2sc_attr_0.attr,
+ &hfi1_sl2sc_attr_1.attr,
+ &hfi1_sl2sc_attr_2.attr,
+ &hfi1_sl2sc_attr_3.attr,
+ &hfi1_sl2sc_attr_4.attr,
+ &hfi1_sl2sc_attr_5.attr,
+ &hfi1_sl2sc_attr_6.attr,
+ &hfi1_sl2sc_attr_7.attr,
+ &hfi1_sl2sc_attr_8.attr,
+ &hfi1_sl2sc_attr_9.attr,
+ &hfi1_sl2sc_attr_10.attr,
+ &hfi1_sl2sc_attr_11.attr,
+ &hfi1_sl2sc_attr_12.attr,
+ &hfi1_sl2sc_attr_13.attr,
+ &hfi1_sl2sc_attr_14.attr,
+ &hfi1_sl2sc_attr_15.attr,
+ &hfi1_sl2sc_attr_16.attr,
+ &hfi1_sl2sc_attr_17.attr,
+ &hfi1_sl2sc_attr_18.attr,
+ &hfi1_sl2sc_attr_19.attr,
+ &hfi1_sl2sc_attr_20.attr,
+ &hfi1_sl2sc_attr_21.attr,
+ &hfi1_sl2sc_attr_22.attr,
+ &hfi1_sl2sc_attr_23.attr,
+ &hfi1_sl2sc_attr_24.attr,
+ &hfi1_sl2sc_attr_25.attr,
+ &hfi1_sl2sc_attr_26.attr,
+ &hfi1_sl2sc_attr_27.attr,
+ &hfi1_sl2sc_attr_28.attr,
+ &hfi1_sl2sc_attr_29.attr,
+ &hfi1_sl2sc_attr_30.attr,
+ &hfi1_sl2sc_attr_31.attr,
+ NULL
+};
+
+static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_sl2sc_attr *sattr =
+ container_of(attr, struct hfi1_sl2sc_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+ return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
+}
+
+static const struct sysfs_ops hfi1_sl2sc_ops = {
+ .show = sl2sc_attr_show,
+};
+
+static struct kobj_type hfi1_sl2sc_ktype = {
+ .release = port_release,
+ .sysfs_ops = &hfi1_sl2sc_ops,
+ .default_attrs = sl2sc_default_attributes
+};
+
+/* End sl2sc */
+
+/* Start vl2mtu */
+
+#define HFI1_VL2MTU_ATTR(N) \
+ static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .vl = N \
+ }
+
+struct hfi1_vl2mtu_attr {
+ struct attribute attr;
+ int vl;
+};
+
+HFI1_VL2MTU_ATTR(0);
+HFI1_VL2MTU_ATTR(1);
+HFI1_VL2MTU_ATTR(2);
+HFI1_VL2MTU_ATTR(3);
+HFI1_VL2MTU_ATTR(4);
+HFI1_VL2MTU_ATTR(5);
+HFI1_VL2MTU_ATTR(6);
+HFI1_VL2MTU_ATTR(7);
+HFI1_VL2MTU_ATTR(8);
+HFI1_VL2MTU_ATTR(9);
+HFI1_VL2MTU_ATTR(10);
+HFI1_VL2MTU_ATTR(11);
+HFI1_VL2MTU_ATTR(12);
+HFI1_VL2MTU_ATTR(13);
+HFI1_VL2MTU_ATTR(14);
+HFI1_VL2MTU_ATTR(15);
+
+static struct attribute *vl2mtu_default_attributes[] = {
+ &hfi1_vl2mtu_attr_0.attr,
+ &hfi1_vl2mtu_attr_1.attr,
+ &hfi1_vl2mtu_attr_2.attr,
+ &hfi1_vl2mtu_attr_3.attr,
+ &hfi1_vl2mtu_attr_4.attr,
+ &hfi1_vl2mtu_attr_5.attr,
+ &hfi1_vl2mtu_attr_6.attr,
+ &hfi1_vl2mtu_attr_7.attr,
+ &hfi1_vl2mtu_attr_8.attr,
+ &hfi1_vl2mtu_attr_9.attr,
+ &hfi1_vl2mtu_attr_10.attr,
+ &hfi1_vl2mtu_attr_11.attr,
+ &hfi1_vl2mtu_attr_12.attr,
+ &hfi1_vl2mtu_attr_13.attr,
+ &hfi1_vl2mtu_attr_14.attr,
+ &hfi1_vl2mtu_attr_15.attr,
+ NULL
+};
+
+static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct hfi1_vl2mtu_attr *vlattr =
+ container_of(attr, struct hfi1_vl2mtu_attr, attr);
+ struct hfi1_pportdata *ppd =
+ container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
+ struct hfi1_devdata *dd = ppd->dd;
+
+ return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
+}
+
+static const struct sysfs_ops hfi1_vl2mtu_ops = {
+ .show = vl2mtu_attr_show,
+};
+
+static struct kobj_type hfi1_vl2mtu_ktype = {
+ .release = port_release,
+ .sysfs_ops = &hfi1_vl2mtu_ops,
+ .default_attrs = vl2mtu_default_attributes
+};
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+
+ return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ int ret;
+
+ if (!dd->boardname)
+ ret = -EINVAL;
+ else
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+ return ret;
+}
+
+static ssize_t show_boardversion(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+static ssize_t show_nctxts(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ /*
+ * Return the smaller of send and receive contexts.
+ * Normally, user level applications would require both a send
+ * and a receive context, so returning the smaller of the two counts
+ * give a more accurate picture of total contexts available.
+ */
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ min(dd->num_rcv_contexts - dd->first_user_ctxt,
+ (u32)dd->sc_sizes[SC_USER].count));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ /* Return the number of free user ports (contexts) available. */
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
+}
+
+static ssize_t store_chip_reset(struct device *device,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ int ret;
+
+ if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ret = hfi1_reset_device(dd->unit);
+bail:
+ return ret < 0 ? ret : count;
+}
+
+/*
+ * Convert the reported temperature from an integer (reported in
+ * units of 0.25C) to a floating point number.
+ */
+#define temp2str(temp, buf, size, idx) \
+ scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ", \
+ ((temp) >> 2), ((temp) & 0x3) * 25)
+
+/*
+ * Dump tempsense values, in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+ struct hfi1_temp temp;
+ int ret;
+
+ ret = hfi1_tempsense_rd(dd, &temp);
+ if (!ret) {
+ int idx = 0;
+
+ idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
+ idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
+ idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
+ idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
+ idx += scnprintf(buf + idx, PAGE_SIZE - idx,
+ "%u %u %u\n", temp.triggers & 0x1,
+ temp.triggers & 0x2, temp.triggers & 0x4);
+ ret = idx;
+ }
+ return ret;
+}
+
+static ssize_t show_sdma_affinity(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ return hfi1_get_sdma_affinity(dd, buf);
+}
+
+static ssize_t store_sdma_affinity(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hfi1_ibdev *dev =
+ container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ return hfi1_set_sdma_affinity(dd, buf, count);
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+static DEVICE_ATTR(sdma_affinity, S_IWUSR | S_IRUGO, show_sdma_affinity,
+ store_sdma_affinity);
+
+static struct device_attribute *hfi1_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_board_id,
+ &dev_attr_nctxts,
+ &dev_attr_nfreectxts,
+ &dev_attr_serial,
+ &dev_attr_boardversion,
+ &dev_attr_tempsense,
+ &dev_attr_chip_reset,
+ &dev_attr_sdma_affinity,
+};
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+ struct kobject *kobj)
+{
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+ int ret;
+
+ if (!port_num || port_num > dd->num_pports) {
+ dd_dev_err(dd,
+ "Skipping infiniband class with invalid port %u\n",
+ port_num);
+ return -ENODEV;
+ }
+ ppd = &dd->pport[port_num - 1];
+
+ ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
+ "sc2vl");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping sc2vl sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail;
+ }
+ kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
+ "sl2sc");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping sl2sc sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_sc2vl;
+ }
+ kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
+ "vl2mtu");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping vl2mtu sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_sl2sc;
+ }
+ kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
+ kobj, "CCMgtA");
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_vl2mtu;
+ }
+
+ kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+ ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_cc;
+ }
+
+ ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr);
+ if (ret) {
+ dd_dev_err(dd,
+ "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_cc_entry_bin;
+ }
+
+ dd_dev_info(dd,
+ "Congestion Control Agent enabled for port %d\n",
+ port_num);
+
+ return 0;
+
+bail_cc_entry_bin:
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+bail_cc:
+ kobject_put(&ppd->pport_cc_kobj);
+bail_vl2mtu:
+ kobject_put(&ppd->vl2mtu_kobj);
+bail_sl2sc:
+ kobject_put(&ppd->sl2sc_kobj);
+bail_sc2vl:
+ kobject_put(&ppd->sc2vl_kobj);
+bail:
+ return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
+{
+ struct ib_device *dev = &dd->verbs_dev.rdi.ibdev;
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
+ ret = device_create_file(&dev->dev, hfi1_attributes[i]);
+ if (ret)
+ goto bail;
+ }
+
+ return 0;
+bail:
+ for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
+ device_remove_file(&dev->dev, hfi1_attributes[i]);
+ return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
+{
+ struct hfi1_pportdata *ppd;
+ int i;
+
+ for (i = 0; i < dd->num_pports; i++) {
+ ppd = &dd->pport[i];
+
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_table_bin_attr);
+ kobject_put(&ppd->pport_cc_kobj);
+ kobject_put(&ppd->vl2mtu_kobj);
+ kobject_put(&ppd->sl2sc_kobj);
+ kobject_put(&ppd->sc2vl_kobj);
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
new file mode 100644
index 000000000000..4cfb13771897
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
+{
+ struct hfi1_other_headers *ohdr;
+ u8 opcode;
+ u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+
+ if (lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ return hdr_len_by_opcode[opcode] == 0 ?
+ 0 : hdr_len_by_opcode[opcode] - (12 + 8);
+}
+
+#define IMM_PRN "imm %d"
+#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
+#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x"
+#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define IETH_PRN "ieth rkey 0x%.8x"
+#define ATOMICACKETH_PRN "origdata %lld"
+#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
+
+#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
+
+static u64 ib_u64_get(__be32 *p)
+{
+ return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
+}
+
+static const char *parse_syndrome(u8 syndrome)
+{
+ switch (syndrome >> 5) {
+ case 0:
+ return "ACK";
+ case 1:
+ return "RNRNAK";
+ case 3:
+ return "NAK";
+ }
+ return "";
+}
+
+const char *parse_everbs_hdrs(
+ struct trace_seq *p,
+ u8 opcode,
+ void *ehdrs)
+{
+ union ib_ehdrs *eh = ehdrs;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ switch (opcode) {
+ /* imm */
+ case OP(RC, SEND_LAST_WITH_IMMEDIATE):
+ case OP(UC, SEND_LAST_WITH_IMMEDIATE):
+ case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
+ case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
+ case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ trace_seq_printf(p, IMM_PRN,
+ be32_to_cpu(eh->imm_data));
+ break;
+ /* reth + imm */
+ case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ trace_seq_printf(p, RETH_PRN " " IMM_PRN,
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->rc.reth.vaddr),
+ be32_to_cpu(eh->rc.reth.rkey),
+ be32_to_cpu(eh->rc.reth.length),
+ be32_to_cpu(eh->rc.imm_data));
+ break;
+ /* reth */
+ case OP(RC, RDMA_READ_REQUEST):
+ case OP(RC, RDMA_WRITE_FIRST):
+ case OP(UC, RDMA_WRITE_FIRST):
+ case OP(RC, RDMA_WRITE_ONLY):
+ case OP(UC, RDMA_WRITE_ONLY):
+ trace_seq_printf(p, RETH_PRN,
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->rc.reth.vaddr),
+ be32_to_cpu(eh->rc.reth.rkey),
+ be32_to_cpu(eh->rc.reth.length));
+ break;
+ case OP(RC, RDMA_READ_RESPONSE_FIRST):
+ case OP(RC, RDMA_READ_RESPONSE_LAST):
+ case OP(RC, RDMA_READ_RESPONSE_ONLY):
+ case OP(RC, ACKNOWLEDGE):
+ trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24,
+ parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
+ be32_to_cpu(eh->aeth) & HFI1_MSN_MASK);
+ break;
+ /* aeth + atomicacketh */
+ case OP(RC, ATOMIC_ACKNOWLEDGE):
+ trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
+ be32_to_cpu(eh->at.aeth) >> 24,
+ parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24),
+ be32_to_cpu(eh->at.aeth) & HFI1_MSN_MASK,
+ (unsigned long long)
+ ib_u64_get(eh->at.atomic_ack_eth));
+ break;
+ /* atomiceth */
+ case OP(RC, COMPARE_SWAP):
+ case OP(RC, FETCH_ADD):
+ trace_seq_printf(p, ATOMICETH_PRN,
+ (unsigned long long)ib_u64_get(
+ eh->atomic_eth.vaddr),
+ eh->atomic_eth.rkey,
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->atomic_eth.swap_data),
+ (unsigned long long)ib_u64_get(
+ (__be32 *)&eh->atomic_eth.compare_data));
+ break;
+ /* deth */
+ case OP(UD, SEND_ONLY):
+ case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
+ trace_seq_printf(p, DETH_PRN,
+ be32_to_cpu(eh->ud.deth[0]),
+ be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK);
+ break;
+ /* ieth */
+ case OP(RC, SEND_LAST_WITH_INVALIDATE):
+ case OP(RC, SEND_ONLY_WITH_INVALIDATE):
+ trace_seq_printf(p, IETH_PRN,
+ be32_to_cpu(eh->ieth));
+ break;
+ }
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+const char *parse_sdma_flags(
+ struct trace_seq *p,
+ u64 desc0, u64 desc1)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ char flags[5] = { 'x', 'x', 'x', 'x', 0 };
+
+ flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+ flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-';
+ flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+ flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+ trace_seq_printf(p, "%s", flags);
+ if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
+ trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
+ (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) &
+ SDMA_DESC1_HEADER_MODE_MASK),
+ (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) &
+ SDMA_DESC1_HEADER_INDEX_MASK),
+ (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) &
+ SDMA_DESC1_HEADER_DWS_MASK));
+ return ret;
+}
+
+const char *print_u32_array(
+ struct trace_seq *p,
+ u32 *arr, int len)
+{
+ int i;
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ for (i = 0; i < len ; i++)
+ trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+__hfi1_trace_fn(PKT);
+__hfi1_trace_fn(PROC);
+__hfi1_trace_fn(SDMA);
+__hfi1_trace_fn(LINKVERB);
+__hfi1_trace_fn(DEBUG);
+__hfi1_trace_fn(SNOOP);
+__hfi1_trace_fn(CNTR);
+__hfi1_trace_fn(PIO);
+__hfi1_trace_fn(DC8051);
+__hfi1_trace_fn(FIRMWARE);
+__hfi1_trace_fn(RCVCTRL);
+__hfi1_trace_fn(TID);
+__hfi1_trace_fn(MMU);
+__hfi1_trace_fn(IOCTL);
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
new file mode 100644
index 000000000000..92dc88f013c9
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include "trace_dbg.h"
+#include "trace_misc.h"
+#include "trace_ctxts.h"
+#include "trace_ibhdrs.h"
+#include "trace_rc.h"
+#include "trace_rx.h"
+#include "trace_tx.h"
diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h
new file mode 100644
index 000000000000..31654bbac1cf
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h
@@ -0,0 +1,141 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license. When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* - Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* - Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+* - Neither the name of Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_CTXTS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+ "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, " \
+ "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+ TP_ARGS(dd, uctxt),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __field(unsigned int, ctxt)
+ __field(u32, credits)
+ __field(u64, hw_free)
+ __field(void __iomem *, piobase)
+ __field(u16, rcvhdrq_cnt)
+ __field(u64, rcvhdrq_phys)
+ __field(u32, eager_cnt)
+ __field(u64, rcvegr_phys)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd);
+ __entry->ctxt = uctxt->ctxt;
+ __entry->credits = uctxt->sc->credits;
+ __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free);
+ __entry->piobase = uctxt->sc->base_addr;
+ __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+ __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+ __entry->eager_cnt = uctxt->egrbufs.alloced;
+ __entry->rcvegr_phys =
+ uctxt->egrbufs.rcvtids[0].phys;
+ ),
+ TP_printk("[%s] ctxt %u " UCTXT_FMT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->credits,
+ __entry->hw_free,
+ __entry->piobase,
+ __entry->rcvhdrq_cnt,
+ __entry->rcvhdrq_phys,
+ __entry->eager_cnt,
+ __entry->rcvegr_phys
+ )
+);
+
+#define CINFO_FMT \
+ "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+ TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt,
+ unsigned int subctxt,
+ struct hfi1_ctxt_info cinfo),
+ TP_ARGS(dd, ctxt, subctxt, cinfo),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __field(unsigned int, ctxt)
+ __field(unsigned int, subctxt)
+ __field(u16, egrtids)
+ __field(u16, rcvhdrq_cnt)
+ __field(u16, rcvhdrq_size)
+ __field(u16, sdma_ring_size)
+ __field(u32, rcvegr_size)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->egrtids = cinfo.egrtids;
+ __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+ __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+ __entry->sdma_ring_size = cinfo.sdma_ring_size;
+ __entry->rcvegr_size = cinfo.rcvegr_size;
+ ),
+ TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->egrtids,
+ __entry->rcvegr_size,
+ __entry->rcvhdrq_cnt,
+ __entry->rcvhdrq_size,
+ __entry->sdma_ring_size
+ )
+);
+
+#endif /* __HFI1_TRACE_CTXTS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_ctxts
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
new file mode 100644
index 000000000000..0e7d929530c5
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -0,0 +1,155 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license. When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* - Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* - Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+* - Neither the name of Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_EXTRA_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_dbg
+
+#define MAX_MSG_LEN 512
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+ TP_PROTO(const char *function, struct va_format *vaf),
+ TP_ARGS(function, vaf),
+ TP_STRUCT__entry(__string(function, function)
+ __dynamic_array(char, msg, MAX_MSG_LEN)
+ ),
+ TP_fast_assign(__assign_str(function, function);
+ WARN_ON_ONCE(vsnprintf
+ (__get_dynamic_array(msg),
+ MAX_MSG_LEN, vaf->fmt,
+ *vaf->va) >=
+ MAX_MSG_LEN);
+ ),
+ TP_printk("(%s) %s",
+ __get_str(function),
+ __get_str(msg))
+);
+
+/*
+ * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
+ * actual function to work and can not be in a macro.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
+ \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \
+ TP_PROTO(const char *function, struct va_format *vaf), \
+ TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \
+{ \
+ struct va_format vaf = { \
+ .fmt = fmt, \
+ }; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ vaf.va = &args; \
+ trace_hfi1_ ##lvl(func, &vaf); \
+ va_end(args); \
+ return; \
+}
+
+/*
+ * To create a new trace level simply define it below and as a __hfi1_trace_fn
+ * in trace.c. This will create all the hooks for calling
+ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
+ * the debugfs stuff.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+__hfi1_trace_def(MMU);
+__hfi1_trace_def(IOCTL);
+
+#define hfi1_cdbg(which, fmt, ...) \
+ __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+ hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in an enablement for this.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+ trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_EXTRA_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_dbg
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
new file mode 100644
index 000000000000..c3e41aed0034
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_IBHDRS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh) \
+__print_symbolic(lrh, \
+ lrh_name(LRH_BTH), \
+ lrh_name(LRH_GRH))
+
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+ "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+ "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+ TP_PROTO(struct hfi1_devdata *dd,
+ struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ /* LRH */
+ __field(u8, vl)
+ __field(u8, lver)
+ __field(u8, sl)
+ __field(u8, lnh)
+ __field(u16, dlid)
+ __field(u16, len)
+ __field(u16, slid)
+ /* BTH */
+ __field(u8, opcode)
+ __field(u8, se)
+ __field(u8, m)
+ __field(u8, pad)
+ __field(u8, tver)
+ __field(u16, pkey)
+ __field(u8, f)
+ __field(u8, b)
+ __field(u32, qpn)
+ __field(u8, a)
+ __field(u32, psn)
+ /* extended headers */
+ __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+ ),
+ TP_fast_assign(
+ struct hfi1_other_headers *ohdr;
+
+ DD_DEV_ASSIGN(dd);
+ /* LRH */
+ __entry->vl =
+ (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+ __entry->lver =
+ (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+ __entry->sl =
+ (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+ __entry->lnh =
+ (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+ __entry->dlid =
+ be16_to_cpu(hdr->lrh[1]);
+ /* allow for larger len */
+ __entry->len =
+ be16_to_cpu(hdr->lrh[2]);
+ __entry->slid =
+ be16_to_cpu(hdr->lrh[3]);
+ /* BTH */
+ if (__entry->lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ __entry->opcode =
+ (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+ __entry->se =
+ (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+ __entry->m =
+ (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+ __entry->pad =
+ (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ __entry->tver =
+ (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+ __entry->pkey =
+ be32_to_cpu(ohdr->bth[0]) & 0xffff;
+ __entry->f =
+ (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+ HFI1_FECN_MASK;
+ __entry->b =
+ (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+ HFI1_BECN_MASK;
+ __entry->qpn =
+ be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+ __entry->a =
+ (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+ /* allow for larger PSN */
+ __entry->psn =
+ be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+ /* extended headers */
+ memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
+ ibhdr_exhdr_len(hdr));
+ ),
+ TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+ __get_str(dev),
+ /* LRH */
+ __entry->vl,
+ __entry->lver,
+ __entry->sl,
+ __entry->lnh, show_lnh(__entry->lnh),
+ __entry->dlid,
+ __entry->len,
+ __entry->slid,
+ /* BTH */
+ __entry->opcode, show_ib_opcode(__entry->opcode),
+ __entry->se,
+ __entry->m,
+ __entry->pad,
+ __entry->tver,
+ __entry->pkey,
+ __entry->f,
+ __entry->b,
+ __entry->qpn,
+ __entry->a,
+ __entry->psn,
+ /* extended headers */
+ __parse_ib_ehdrs(
+ __entry->opcode,
+ (void *)__get_dynamic_array(ehdrs))
+ )
+);
+
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
+ TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+ TP_ARGS(dd, hdr));
+
+#endif /* __HFI1_TRACE_IBHDRS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_ibhdrs
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h
new file mode 100644
index 000000000000..d308454af7fd
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_misc.h
@@ -0,0 +1,81 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license. When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* - Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* - Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+* - Neither the name of Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_MISC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+TRACE_EVENT(hfi1_interrupt,
+ TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+ int src),
+ TP_ARGS(dd, is_entry, src),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __array(char, buf, 64)
+ __field(int, src)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd)
+ is_entry->is_name(__entry->buf, 64,
+ src - is_entry->start);
+ __entry->src = src;
+ ),
+ TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+ __entry->src)
+);
+
+#endif /* __HFI1_TRACE_MISC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_misc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
new file mode 100644
index 000000000000..5ea5005f9f41
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -0,0 +1,123 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license. When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* - Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* - Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in
+* the documentation and/or other materials provided with the
+* distribution.
+* - Neither the name of Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived
+* from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+DECLARE_EVENT_CLASS(hfi1_rc_template,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, s_flags)
+ __field(u32, psn)
+ __field(u32, s_psn)
+ __field(u32, s_next_psn)
+ __field(u32, s_sending_psn)
+ __field(u32, s_sending_hpsn)
+ __field(u32, r_psn)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ __entry->psn = psn;
+ __entry->s_psn = qp->s_psn;
+ __entry->s_next_psn = qp->s_next_psn;
+ __entry->s_sending_psn = qp->s_sending_psn;
+ __entry->s_sending_hpsn = qp->s_sending_hpsn;
+ __entry->r_psn = qp->r_psn;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->s_flags,
+ __entry->psn,
+ __entry->s_psn,
+ __entry->s_next_psn,
+ __entry->s_sending_psn,
+ __entry->s_sending_hpsn,
+ __entry->r_psn
+ )
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_timeout,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+#endif /* __HFI1_TRACE_RC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
new file mode 100644
index 000000000000..9ba1f615ec95
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+TRACE_EVENT(hfi1_rcvhdr,
+ TP_PROTO(struct hfi1_devdata *dd,
+ u32 ctxt,
+ u64 eflags,
+ u32 etype,
+ u32 hlen,
+ u32 tlen,
+ u32 updegr,
+ u32 etail
+ ),
+ TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __field(u64, eflags)
+ __field(u32, ctxt)
+ __field(u32, etype)
+ __field(u32, hlen)
+ __field(u32, tlen)
+ __field(u32, updegr)
+ __field(u32, etail)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd);
+ __entry->eflags = eflags;
+ __entry->ctxt = ctxt;
+ __entry->etype = etype;
+ __entry->hlen = hlen;
+ __entry->tlen = tlen;
+ __entry->updegr = updegr;
+ __entry->etail = etail;
+ ),
+ TP_printk(
+ "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->eflags,
+ __entry->etype, show_packettype(__entry->etype),
+ __entry->hlen,
+ __entry->tlen,
+ __entry->updegr,
+ __entry->etail
+ )
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+ TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+ TP_ARGS(dd, ctxt),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __field(u32, ctxt)
+ __field(u8, slow_path)
+ __field(u8, dma_rtail)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ if (dd->rcd[ctxt]->do_interrupt ==
+ &handle_receive_interrupt) {
+ __entry->slow_path = 1;
+ __entry->dma_rtail = 0xFF;
+ } else if (dd->rcd[ctxt]->do_interrupt ==
+ &handle_receive_interrupt_dma_rtail){
+ __entry->dma_rtail = 1;
+ __entry->slow_path = 0;
+ } else if (dd->rcd[ctxt]->do_interrupt ==
+ &handle_receive_interrupt_nodma_rtail) {
+ __entry->dma_rtail = 0;
+ __entry->slow_path = 0;
+ }
+ ),
+ TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->slow_path,
+ __entry->dma_rtail
+ )
+);
+
+TRACE_EVENT(hfi1_exp_tid_reg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
+ u32 npages, unsigned long va, unsigned long pa,
+ dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+ TP_STRUCT__entry(
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(unsigned long, va)
+ __field(unsigned long, pa)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->rarr = rarr;
+ __entry->npages = npages;
+ __entry->va = va;
+ __entry->pa = pa;
+ __entry->dma = dma;
+ ),
+ TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->rarr,
+ __entry->npages,
+ __entry->pa,
+ __entry->va,
+ __entry->dma
+ )
+ );
+
+TRACE_EVENT(hfi1_exp_tid_unreg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+ TP_STRUCT__entry(
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(unsigned long, va)
+ __field(unsigned long, pa)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->rarr = rarr;
+ __entry->npages = npages;
+ __entry->va = va;
+ __entry->pa = pa;
+ __entry->dma = dma;
+ ),
+ TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->rarr,
+ __entry->npages,
+ __entry->pa,
+ __entry->va,
+ __entry->dma
+ )
+ );
+
+TRACE_EVENT(hfi1_exp_tid_inval,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
+ u32 npages, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+ TP_STRUCT__entry(
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(unsigned long, va)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->va = va;
+ __entry->rarr = rarr;
+ __entry->npages = npages;
+ __entry->dma = dma;
+ ),
+ TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->rarr,
+ __entry->npages,
+ __entry->va,
+ __entry->dma
+ )
+ );
+
+TRACE_EVENT(hfi1_mmu_invalidate,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
+ unsigned long start, unsigned long end),
+ TP_ARGS(ctxt, subctxt, type, start, end),
+ TP_STRUCT__entry(
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __string(type, type)
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ ),
+ TP_fast_assign(
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __assign_str(type, type);
+ __entry->start = start;
+ __entry->end = end;
+ ),
+ TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
+ __entry->ctxt,
+ __entry->subctxt,
+ __get_str(type),
+ __entry->start,
+ __entry->end
+ )
+ );
+
+#define SNOOP_PRN \
+ "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+ "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+TRACE_EVENT(snoop_capture,
+ TP_PROTO(struct hfi1_devdata *dd,
+ int hdr_len,
+ struct hfi1_ib_header *hdr,
+ int data_len,
+ void *data),
+ TP_ARGS(dd, hdr_len, hdr, data_len, data),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, slid)
+ __field(u16, dlid)
+ __field(u32, qpn)
+ __field(u8, opcode)
+ __field(u8, sl)
+ __field(u16, pkey)
+ __field(u32, hdr_len)
+ __field(u32, data_len)
+ __field(u8, lnh)
+ __dynamic_array(u8, raw_hdr, hdr_len)
+ __dynamic_array(u8, raw_pkt, data_len)
+ ),
+ TP_fast_assign(
+ struct hfi1_other_headers *ohdr;
+
+ __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+ if (__entry->lnh == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ DD_DEV_ASSIGN(dd);
+ __entry->slid = be16_to_cpu(hdr->lrh[3]);
+ __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+ __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+ __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+ __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+ __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+ __entry->hdr_len = hdr_len;
+ __entry->data_len = data_len;
+ memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+ memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+ ),
+ TP_printk(
+ "[%s] " SNOOP_PRN,
+ __get_str(dev),
+ __entry->slid,
+ __entry->dlid,
+ __entry->qpn,
+ __entry->opcode,
+ show_ib_opcode(__entry->opcode),
+ __entry->sl,
+ __entry->pkey,
+ __entry->hdr_len,
+ __entry->data_len
+ )
+);
+
+#endif /* __HFI1_TRACE_RX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rx
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
new file mode 100644
index 000000000000..415d6be42c5d
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -0,0 +1,642 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_TX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree,
+ TP_PROTO(struct send_context *sc, int extra),
+ TP_ARGS(sc, extra),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+ __field(u32, sw_index)
+ __field(u32, hw_context)
+ __field(int, extra)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+ __entry->sw_index = sc->sw_index;
+ __entry->hw_context = sc->hw_context;
+ __entry->extra = extra;
+ ),
+ TP_printk("[%s] ctxt %u(%u) extra %d",
+ __get_str(dev),
+ __entry->sw_index,
+ __entry->hw_context,
+ __entry->extra
+ )
+);
+
+TRACE_EVENT(hfi1_wantpiointr,
+ TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+ TP_ARGS(sc, needint, credit_ctrl),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+ __field(u32, sw_index)
+ __field(u32, hw_context)
+ __field(u32, needint)
+ __field(u64, credit_ctrl)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+ __entry->sw_index = sc->sw_index;
+ __entry->hw_context = sc->hw_context;
+ __entry->needint = needint;
+ __entry->credit_ctrl = credit_ctrl;
+ ),
+ TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+ __get_str(dev),
+ __entry->sw_index,
+ __entry->hw_context,
+ __entry->needint,
+ (unsigned long long)__entry->credit_ctrl
+ )
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+ TP_PROTO(struct rvt_qp *qp, u32 flags),
+ TP_ARGS(qp, flags),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, flags)
+ __field(u32, s_flags)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->flags = flags;
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->flags,
+ __entry->s_flags
+ )
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+ TP_PROTO(struct rvt_qp *qp, u32 flags),
+ TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+ TP_PROTO(struct rvt_qp *qp, u32 flags),
+ TP_ARGS(qp, flags));
+
+TRACE_EVENT(hfi1_sdma_descriptor,
+ TP_PROTO(struct sdma_engine *sde,
+ u64 desc0,
+ u64 desc1,
+ u16 e,
+ void *descp),
+ TP_ARGS(sde, desc0, desc1, e, descp),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __field(void *, descp)
+ __field(u64, desc0)
+ __field(u64, desc1)
+ __field(u16, e)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __entry->desc0 = desc0;
+ __entry->desc1 = desc1;
+ __entry->idx = sde->this_idx;
+ __entry->descp = descp;
+ __entry->e = e;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+ __get_str(dev),
+ __entry->idx,
+ __parse_sdma_flags(__entry->desc0, __entry->desc1),
+ (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
+ SDMA_DESC0_PHY_ADDR_MASK,
+ (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
+ SDMA_DESC1_GENERATION_MASK),
+ (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
+ SDMA_DESC0_BYTE_COUNT_MASK),
+ __entry->desc0,
+ __entry->desc1,
+ __entry->descp,
+ __entry->e
+ )
+);
+
+TRACE_EVENT(hfi1_sdma_engine_select,
+ TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+ TP_ARGS(dd, sel, vl, idx),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __field(u32, sel)
+ __field(u8, vl)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd);
+ __entry->sel = sel;
+ __entry->vl = vl;
+ __entry->idx = idx;
+ ),
+ TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
+ __get_str(dev),
+ __entry->idx,
+ __entry->sel,
+ __entry->vl
+ )
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+ TP_PROTO(struct sdma_engine *sde, u64 status),
+ TP_ARGS(sde, status),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __field(u64, status)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __entry->status = status;
+ __entry->idx = sde->this_idx;
+ ),
+ TP_printk("[%s] SDE(%u) status %llx",
+ __get_str(dev),
+ __entry->idx,
+ (unsigned long long)__entry->status
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+ TP_PROTO(struct sdma_engine *sde, u64 status),
+ TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+ TP_PROTO(struct sdma_engine *sde, u64 status),
+ TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+ TP_PROTO(struct sdma_engine *sde, int aidx),
+ TP_ARGS(sde, aidx),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __field(int, aidx)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __entry->idx = sde->this_idx;
+ __entry->aidx = aidx;
+ ),
+ TP_printk("[%s] SDE(%u) aidx %d",
+ __get_str(dev),
+ __entry->idx,
+ __entry->aidx
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+ TP_PROTO(struct sdma_engine *sde, int aidx),
+ TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+ TP_PROTO(struct sdma_engine *sde, int aidx),
+ TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+ TP_PROTO(struct sdma_engine *sde,
+ u16 hwhead,
+ u16 swhead,
+ struct sdma_txreq *txp
+ ),
+ TP_ARGS(sde, hwhead, swhead, txp),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __field(u64, sn)
+ __field(u16, hwhead)
+ __field(u16, swhead)
+ __field(u16, txnext)
+ __field(u16, tx_tail)
+ __field(u16, tx_head)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __entry->hwhead = hwhead;
+ __entry->swhead = swhead;
+ __entry->tx_tail = sde->tx_tail;
+ __entry->tx_head = sde->tx_head;
+ __entry->txnext = txp ? txp->next_descq_idx : ~0;
+ __entry->idx = sde->this_idx;
+ __entry->sn = txp ? txp->sn : ~0;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+ __get_str(dev),
+ __entry->idx,
+ __entry->sn,
+ __entry->hwhead,
+ __entry->swhead,
+ __entry->txnext,
+ __entry->tx_head,
+ __entry->tx_tail
+ )
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+ TP_PROTO(struct sdma_engine *sde,
+ u16 hwhead, u16 swhead,
+ struct sdma_txreq *txp
+ ),
+ TP_ARGS(sde, hwhead, swhead, txp),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __field(u16, hwhead)
+ __field(u16, swhead)
+ __field(u16, txnext)
+ __field(u16, tx_tail)
+ __field(u16, tx_head)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __entry->hwhead = hwhead;
+ __entry->swhead = swhead;
+ __entry->tx_tail = sde->tx_tail;
+ __entry->tx_head = sde->tx_head;
+ __entry->txnext = txp ? txp->next_descq_idx : ~0;
+ __entry->idx = sde->this_idx;
+ ),
+ TP_printk(
+ "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+ __get_str(dev),
+ __entry->idx,
+ __entry->hwhead,
+ __entry->swhead,
+ __entry->txnext,
+ __entry->tx_head,
+ __entry->tx_tail
+ )
+);
+#endif
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+ TP_PROTO(struct sdma_engine *sde, u64 sn),
+ TP_ARGS(sde, sn),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __field(u64, sn)
+ __field(u8, idx)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __entry->sn = sn;
+ __entry->idx = sde->this_idx;
+ ),
+ TP_printk("[%s] SDE(%u) sn %llu",
+ __get_str(dev),
+ __entry->idx,
+ __entry->sn
+ )
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+ TP_PROTO(
+ struct sdma_engine *sde,
+ u64 sn
+ ),
+ TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+ TP_PROTO(struct sdma_engine *sde, u64 sn),
+ TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+ "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+ struct hfi1_pkt_header *hdr, u32 tidval),
+ TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u16, req)
+ __field(u32, pbc0)
+ __field(u32, pbc1)
+ __field(u32, lrh0)
+ __field(u32, lrh1)
+ __field(u32, bth0)
+ __field(u32, bth1)
+ __field(u32, bth2)
+ __field(u32, kdeth0)
+ __field(u32, kdeth1)
+ __field(u32, kdeth2)
+ __field(u32, kdeth3)
+ __field(u32, kdeth4)
+ __field(u32, kdeth5)
+ __field(u32, kdeth6)
+ __field(u32, kdeth7)
+ __field(u32, kdeth8)
+ __field(u32, tidval)
+ ),
+ TP_fast_assign(
+ __le32 *pbc = (__le32 *)hdr->pbc;
+ __be32 *lrh = (__be32 *)hdr->lrh;
+ __be32 *bth = (__be32 *)hdr->bth;
+ __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->req = req;
+ __entry->pbc0 = le32_to_cpu(pbc[0]);
+ __entry->pbc1 = le32_to_cpu(pbc[1]);
+ __entry->lrh0 = be32_to_cpu(lrh[0]);
+ __entry->lrh1 = be32_to_cpu(lrh[1]);
+ __entry->bth0 = be32_to_cpu(bth[0]);
+ __entry->bth1 = be32_to_cpu(bth[1]);
+ __entry->bth2 = be32_to_cpu(bth[2]);
+ __entry->kdeth0 = le32_to_cpu(kdeth[0]);
+ __entry->kdeth1 = le32_to_cpu(kdeth[1]);
+ __entry->kdeth2 = le32_to_cpu(kdeth[2]);
+ __entry->kdeth3 = le32_to_cpu(kdeth[3]);
+ __entry->kdeth4 = le32_to_cpu(kdeth[4]);
+ __entry->kdeth5 = le32_to_cpu(kdeth[5]);
+ __entry->kdeth6 = le32_to_cpu(kdeth[6]);
+ __entry->kdeth7 = le32_to_cpu(kdeth[7]);
+ __entry->kdeth8 = le32_to_cpu(kdeth[8]);
+ __entry->tidval = tidval;
+ ),
+ TP_printk(USDMA_HDR_FORMAT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->req,
+ __entry->pbc1,
+ __entry->pbc0,
+ __entry->lrh0,
+ __entry->lrh1,
+ __entry->bth0,
+ __entry->bth1,
+ __entry->bth2,
+ __entry->kdeth0,
+ __entry->kdeth1,
+ __entry->kdeth2,
+ __entry->kdeth3,
+ __entry->kdeth4,
+ __entry->kdeth5,
+ __entry->kdeth6,
+ __entry->kdeth7,
+ __entry->kdeth8,
+ __entry->tidval
+ )
+);
+
+#define SDMA_UREQ_FMT \
+ "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+ TP_ARGS(dd, ctxt, subctxt, i),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd);
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u8, ver_opcode)
+ __field(u8, iovcnt)
+ __field(u16, npkts)
+ __field(u16, fragsize)
+ __field(u16, comp_idx)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->ver_opcode = i[0] & 0xff;
+ __entry->iovcnt = (i[0] >> 8) & 0xff;
+ __entry->npkts = i[1];
+ __entry->fragsize = i[2];
+ __entry->comp_idx = i[3];
+ ),
+ TP_printk(SDMA_UREQ_FMT,
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->ver_opcode,
+ __entry->iovcnt,
+ __entry->npkts,
+ __entry->fragsize,
+ __entry->comp_idx
+ )
+);
+
+#define usdma_complete_name(st) { st, #st }
+#define show_usdma_complete_state(st) \
+ __print_symbolic(st, \
+ usdma_complete_name(FREE), \
+ usdma_complete_name(QUEUED), \
+ usdma_complete_name(COMPLETE), \
+ usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+ u8 state, int code),
+ TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u16, idx)
+ __field(u8, state)
+ __field(int, code)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->idx = idx;
+ __entry->state = state;
+ __entry->code = code;
+ ),
+ TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+ __get_str(dev), __entry->ctxt, __entry->subctxt,
+ __entry->idx, show_usdma_complete_state(__entry->state),
+ __entry->code)
+);
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+ TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+ u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+ TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+ TP_STRUCT__entry(
+ DD_DEV_ENTRY(dd)
+ __field(u16, ctxt)
+ __field(u8, subctxt)
+ __field(u16, req)
+ __field(u8, sde)
+ __field(u8, idx)
+ __field(int, len)
+ __field(u32, tidval)
+ __array(u32, ahg, 10)
+ ),
+ TP_fast_assign(
+ DD_DEV_ASSIGN(dd);
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->req = req;
+ __entry->sde = sde;
+ __entry->idx = ahgidx;
+ __entry->len = len;
+ __entry->tidval = tidval;
+ memcpy(__entry->ahg, ahg, len * sizeof(u32));
+ ),
+ TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+ __get_str(dev),
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->req,
+ __entry->sde,
+ __entry->idx,
+ __entry->len - 1,
+ __print_u32_hex(__entry->ahg, __entry->len),
+ __entry->tidval
+ )
+);
+
+TRACE_EVENT(hfi1_sdma_state,
+ TP_PROTO(struct sdma_engine *sde,
+ const char *cstate,
+ const char *nstate
+ ),
+ TP_ARGS(sde, cstate, nstate),
+ TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+ __string(curstate, cstate)
+ __string(newstate, nstate)
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+ __assign_str(curstate, cstate);
+ __assign_str(newstate, nstate);
+ ),
+ TP_printk("[%s] current state %s new state %s",
+ __get_str(dev),
+ __get_str(curstate),
+ __get_str(newstate)
+ )
+);
+
+#define BCT_FORMAT \
+ "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
+#define BCT(field) \
+ be16_to_cpu( \
+ ((struct buffer_control *)__get_dynamic_array(bct))->field \
+ )
+
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+ TP_PROTO(struct hfi1_devdata *dd,
+ struct buffer_control *bc),
+ TP_ARGS(dd, bc),
+ TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+ __dynamic_array(u8, bct, sizeof(*bc))
+ ),
+ TP_fast_assign(DD_DEV_ASSIGN(dd);
+ memcpy(__get_dynamic_array(bct), bc,
+ sizeof(*bc));
+ ),
+ TP_printk(BCT_FORMAT,
+ BCT(overall_shared_limit),
+
+ BCT(vl[0].dedicated),
+ BCT(vl[0].shared),
+
+ BCT(vl[1].dedicated),
+ BCT(vl[1].shared),
+
+ BCT(vl[2].dedicated),
+ BCT(vl[2].shared),
+
+ BCT(vl[3].dedicated),
+ BCT(vl[3].shared),
+
+ BCT(vl[4].dedicated),
+ BCT(vl[4].shared),
+
+ BCT(vl[5].dedicated),
+ BCT(vl[5].shared),
+
+ BCT(vl[6].dedicated),
+ BCT(vl[6].shared),
+
+ BCT(vl[7].dedicated),
+ BCT(vl[7].shared),
+
+ BCT(vl[15].dedicated),
+ BCT(vl[15].shared)
+ )
+);
+
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+ TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+ TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+ TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+ TP_ARGS(dd, bc));
+
+#endif /* __HFI1_TRACE_TX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tx
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
new file mode 100644
index 000000000000..a726d96d185f
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -0,0 +1,595 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/* only opcode mask for adaptive pio */
+const u32 uc_only_opcode =
+ BIT(OP(SEND_ONLY) & 0x1f) |
+ BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) |
+ BIT(OP(RDMA_WRITE_ONLY & 0x1f)) |
+ BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f));
+
+/**
+ * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Assume s_lock is held.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_other_headers *ohdr;
+ struct rvt_swqe *wqe;
+ u32 hwords = 5;
+ u32 bth0 = 0;
+ u32 len;
+ u32 pmtu = qp->pmtu;
+ int middle = 0;
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (IS_ERR(ps->s_txreq))
+ goto bail_no_tx;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+ if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ smp_read_barrier_depends(); /* see post_one_send() */
+ if (qp->s_last == ACCESS_ONCE(qp->s_head))
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done_free_tx;
+ }
+
+ ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+
+ /* Get the next send request. */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+ qp->s_wqe = NULL;
+ switch (qp->s_state) {
+ default:
+ if (!(ib_rvt_state_ops[qp->state] &
+ RVT_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /* Check if send work queue is empty. */
+ smp_read_barrier_depends(); /* see post_one_send() */
+ if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
+ clear_ahg(qp);
+ goto bail;
+ }
+ /*
+ * Local operations are processed immediately
+ * after all prior requests have completed.
+ */
+ if (wqe->wr.opcode == IB_WR_REG_MR ||
+ wqe->wr.opcode == IB_WR_LOCAL_INV) {
+ int local_ops = 0;
+ int err = 0;
+
+ if (qp->s_last != qp->s_cur)
+ goto bail;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+ err = rvt_invalidate_rkey(
+ qp, wqe->wr.ex.invalidate_rkey);
+ local_ops = 1;
+ }
+ hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
+ : IB_WC_SUCCESS);
+ if (local_ops)
+ atomic_dec(&qp->local_ops_pending);
+ qp->s_hdrwords = 0;
+ goto done_free_tx;
+ }
+ /*
+ * Start a new request.
+ */
+ qp->s_psn = wqe->psn;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ len = wqe->length;
+ qp->s_len = len;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (len > pmtu) {
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND) {
+ qp->s_state = OP(SEND_ONLY);
+ } else {
+ qp->s_state =
+ OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ if (len > pmtu) {
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ } else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ break;
+
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND) {
+ qp->s_state = OP(SEND_LAST);
+ } else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ } else {
+ qp->s_state =
+ OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+ mask_psn(qp->s_psn++), middle, ps);
+ /* pbc */
+ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+ return 1;
+
+done_free_tx:
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ return 1;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+ ps->s_txreq = NULL;
+ qp->s_flags &= ~RVT_S_BUSY;
+ qp->s_hdrwords = 0;
+ return 0;
+}
+
+/**
+ * hfi1_uc_rcv - handle an incoming UC packet
+ * @ibp: the port the packet came in on
+ * @hdr: the header of the packet
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_uc_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 rcv_flags = packet->rcv_flags;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_other_headers *ohdr = packet->ohdr;
+ u32 bth0, opcode;
+ u32 hdrsize = packet->hlen;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = qp->pmtu;
+ struct ib_reth *reth;
+ int has_grh = rcv_flags & HFI1_HAS_GRH;
+ int ret;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
+ return;
+
+ process_ecn(qp, packet, true);
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode = (bth0 >> 24) & 0xff;
+
+ /* Compare the PSN verses the expected PSN. */
+ if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
+ /*
+ * Handle a sequence error.
+ * Silently drop any current message.
+ */
+ qp->r_psn = psn;
+inv:
+ if (qp->r_state == OP(SEND_FIRST) ||
+ qp->r_state == OP(SEND_MIDDLE)) {
+ set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
+ qp->r_sge.num_sge = 0;
+ } else {
+ rvt_put_ss(&qp->r_sge);
+ }
+ qp->r_state = OP(SEND_LAST);
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ goto send_first;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ goto rdma_first;
+
+ default:
+ goto drop;
+ }
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ default:
+ if (opcode == OP(SEND_FIRST) ||
+ opcode == OP(SEND_ONLY) ||
+ opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_FIRST) ||
+ opcode == OP(RDMA_WRITE_ONLY) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ break;
+ goto inv;
+ }
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ qp_comm_est(qp);
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+ if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
+ qp->r_sge = qp->s_rdma_read_sge;
+ } else {
+ ret = hfi1_rvt_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto drop;
+ /*
+ * qp->s_rdma_read_sge will be the owner
+ * of the mr references.
+ */
+ qp->s_rdma_read_sge = qp->r_sge;
+ }
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto no_immediate_data;
+ else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+ goto send_last_imm;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto rewind;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto rewind;
+ hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0);
+ break;
+
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+ wc.ex.imm_data = ohdr->u.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+ case OP(SEND_LAST):
+no_immediate_data:
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto rewind;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto rewind;
+ wc.opcode = IB_WC_RECV;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0);
+ rvt_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ /*
+ * It seems that IB mandates the presence of an SL in a
+ * work completion only for the UD transport (see section
+ * 11.4.2 of IBTA Vol. 1).
+ *
+ * However, the way the SL is chosen below is consistent
+ * with the way that IB/qib works and is trying avoid
+ * introducing incompatibilities.
+ *
+ * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+ */
+ wc.sl = qp->remote_ah_attr.sl;
+ /* zero fields that are N/A */
+ wc.vendor_err = 0;
+ wc.pkey_index = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE))) {
+ goto drop;
+ }
+ reth = &ohdr->u.rc.reth;
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ qp->r_sge.sg_list = NULL;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey */
+ ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+ vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto drop;
+ qp->r_sge.num_sge = 1;
+ } else {
+ qp->r_sge.num_sge = 0;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_ONLY)) {
+ goto rdma_last;
+ } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+ wc.ex.imm_data = ohdr->u.rc.imm_data;
+ goto rdma_last_imm;
+ }
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto drop;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto drop;
+ hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+ wc.wc_flags = IB_WC_WITH_IMM;
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+ goto drop;
+ if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
+ rvt_put_ss(&qp->s_rdma_read_sge);
+ } else {
+ ret = hfi1_rvt_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto drop;
+ }
+ wc.byte_len = qp->r_len;
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
+ rvt_put_ss(&qp->r_sge);
+ goto last_imm;
+
+ case OP(RDMA_WRITE_LAST):
+rdma_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+ goto drop;
+ hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0);
+ rvt_put_ss(&qp->r_sge);
+ break;
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ goto drop;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ return;
+
+rewind:
+ set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
+ qp->r_sge.num_sge = 0;
+drop:
+ ibp->rvp.n_pkt_drops++;
+ return;
+
+op_err:
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+}
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
new file mode 100644
index 000000000000..f01e8e1d62d3
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+
+/**
+ * ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from hfi1_make_ud_req() to forward a WQE addressed
+ * to the same HFI.
+ * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
+ * while this is being called.
+ */
+static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
+{
+ struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+ struct hfi1_pportdata *ppd;
+ struct rvt_qp *qp;
+ struct ib_ah_attr *ah_attr;
+ unsigned long flags;
+ struct rvt_sge_state ssge;
+ struct rvt_sge *sge;
+ struct ib_wc wc;
+ u32 length;
+ enum ib_qp_type sqptype, dqptype;
+
+ rcu_read_lock();
+
+ qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
+ swqe->ud_wr.remote_qpn);
+ if (!qp) {
+ ibp->rvp.n_pkt_drops++;
+ rcu_read_unlock();
+ return;
+ }
+
+ sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+ IB_QPT_UD : sqp->ibqp.qp_type;
+ dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+ IB_QPT_UD : qp->ibqp.qp_type;
+
+ if (dqptype != sqptype ||
+ !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ibp->rvp.n_pkt_drops++;
+ goto drop;
+ }
+
+ ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+ ppd = ppd_from_ibp(ibp);
+
+ if (qp->ibqp.qp_num > 1) {
+ u16 pkey;
+ u16 slid;
+ u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
+
+ pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
+ slid = ppd->lid | (ah_attr->src_path_bits &
+ ((1 << ppd->lmc) - 1));
+ if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
+ qp->s_pkey_index, slid))) {
+ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey,
+ ah_attr->sl,
+ sqp->ibqp.qp_num, qp->ibqp.qp_num,
+ slid, ah_attr->dlid);
+ goto drop;
+ }
+ }
+
+ /*
+ * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ if (qp->ibqp.qp_num) {
+ u32 qkey;
+
+ qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
+ sqp->qkey : swqe->ud_wr.remote_qkey;
+ if (unlikely(qkey != qp->qkey)) {
+ u16 lid;
+
+ lid = ppd->lid | (ah_attr->src_path_bits &
+ ((1 << ppd->lmc) - 1));
+ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
+ ah_attr->sl,
+ sqp->ibqp.qp_num, qp->ibqp.qp_num,
+ lid,
+ ah_attr->dlid);
+ goto drop;
+ }
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ length = swqe->length;
+ memset(&wc, 0, sizeof(wc));
+ wc.byte_len = length + sizeof(struct ib_grh);
+
+ if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = swqe->wr.ex.imm_data;
+ }
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & RVT_R_REUSE_SGE) {
+ qp->r_flags &= ~RVT_R_REUSE_SGE;
+ } else {
+ int ret;
+
+ ret = hfi1_rvt_get_rwqe(qp, 0);
+ if (ret < 0) {
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ goto bail_unlock;
+ }
+ if (!ret) {
+ if (qp->ibqp.qp_num == 0)
+ ibp->rvp.n_vl15_dropped++;
+ goto bail_unlock;
+ }
+ }
+ /* Silently drop packets which are too big. */
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= RVT_R_REUSE_SGE;
+ ibp->rvp.n_pkt_drops++;
+ goto bail_unlock;
+ }
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ struct ib_grh grh;
+ struct ib_global_route grd = ah_attr->grh;
+
+ hfi1_make_grh(ibp, &grh, &grd, 0, 0);
+ hfi1_copy_sge(&qp->r_sge, &grh,
+ sizeof(grh), 1, 0);
+ wc.wc_flags |= IB_WC_GRH;
+ } else {
+ hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+ }
+ ssge.sg_list = swqe->sg_list + 1;
+ ssge.sge = *swqe->sg_list;
+ ssge.num_sge = swqe->wr.num_sge;
+ sge = &ssge.sge;
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ssge.num_sge)
+ *sge = *ssge.sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+ rvt_put_ss(&qp->r_sge);
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+ goto bail_unlock;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = sqp->ibqp.qp_num;
+ if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
+ if (sqp->ibqp.qp_type == IB_QPT_GSI ||
+ sqp->ibqp.qp_type == IB_QPT_SMI)
+ wc.pkey_index = swqe->ud_wr.pkey_index;
+ else
+ wc.pkey_index = sqp->s_pkey_index;
+ } else {
+ wc.pkey_index = 0;
+ }
+ wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+ /* Check for loopback when the port lid is not set */
+ if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
+ wc.slid = be16_to_cpu(IB_LID_PERMISSIVE);
+ wc.sl = ah_attr->sl;
+ wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+ wc.port_num = qp->port_num;
+ /* Signal completion event if the solicited bit is set. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+ swqe->wr.send_flags & IB_SEND_SOLICITED);
+ ibp->rvp.n_loop_pkts++;
+bail_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+ rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Assume s_lock is held.
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_other_headers *ohdr;
+ struct ib_ah_attr *ah_attr;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_ibport *ibp;
+ struct rvt_swqe *wqe;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u16 lrh0;
+ u16 lid;
+ int next_cur;
+ u8 sc5;
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (IS_ERR(ps->s_txreq))
+ goto bail_no_tx;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ smp_read_barrier_depends(); /* see post_one_send */
+ if (qp->s_last == ACCESS_ONCE(qp->s_head))
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done_free_tx;
+ }
+
+ /* see post_one_send() */
+ smp_read_barrier_depends();
+ if (qp->s_cur == ACCESS_ONCE(qp->s_head))
+ goto bail;
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+ next_cur = qp->s_cur + 1;
+ if (next_cur >= qp->s_size)
+ next_cur = 0;
+
+ /* Construct the header. */
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ppd = ppd_from_ibp(ibp);
+ ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+ if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+ ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+ lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+ if (unlikely(!loopback &&
+ (lid == ppd->lid ||
+ (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
+ qp->ibqp.qp_type == IB_QPT_GSI)))) {
+ unsigned long tflags = ps->flags;
+ /*
+ * If DMAs are in progress, we can't generate
+ * a completion for the loopback packet since
+ * it would be out of order.
+ * Instead of waiting, we could queue a
+ * zero length descriptor so we get a callback.
+ */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ qp->s_cur = next_cur;
+ spin_unlock_irqrestore(&qp->s_lock, tflags);
+ ud_loopback(qp, wqe);
+ spin_lock_irqsave(&qp->s_lock, tflags);
+ ps->flags = tflags;
+ hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+ goto done_free_tx;
+ }
+ }
+
+ qp->s_cur = next_cur;
+ extra_bytes = -wqe->length & 3;
+ nwords = (wqe->length + extra_bytes) >> 2;
+
+ /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+ qp->s_hdrwords = 7;
+ qp->s_cur_size = wqe->length;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_srate = ah_attr->static_rate;
+ qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+ qp->s_wqe = wqe;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ /* Header size in 32-bit words. */
+ qp->s_hdrwords += hfi1_make_grh(ibp,
+ &ps->s_txreq->phdr.hdr.u.l.grh,
+ &ah_attr->grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = HFI1_LRH_GRH;
+ ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
+ /*
+ * Don't worry about sending to locally attached multicast
+ * QPs. It is unspecified by the spec. what happens.
+ */
+ } else {
+ /* Header size in 32-bit words. */
+ lrh0 = HFI1_LRH_BTH;
+ ohdr = &ps->s_txreq->phdr.hdr.u.oth;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_hdrwords++;
+ ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+ bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+ } else {
+ bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+ }
+ sc5 = ibp->sl_to_sc[ah_attr->sl];
+ lrh0 |= (ah_attr->sl & 0xf) << 4;
+ if (qp->ibqp.qp_type == IB_QPT_SMI) {
+ lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+ priv->s_sc = 0xf;
+ } else {
+ lrh0 |= (sc5 & 0xf) << 12;
+ priv->s_sc = sc5;
+ }
+ priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc);
+ ps->s_txreq->sde = priv->s_sde;
+ priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
+ ps->s_txreq->psc = priv->s_sendcontext;
+ ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
+ ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);
+ ps->s_txreq->phdr.hdr.lrh[2] =
+ cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+ ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
+ } else {
+ lid = ppd->lid;
+ if (lid) {
+ lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+ ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid);
+ } else {
+ ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE;
+ }
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth0 |= extra_bytes << 20;
+ if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
+ bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+ else
+ bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
+ /*
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
+ qp->qkey : wqe->ud_wr.remote_qkey);
+ ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+ /* disarm any ahg */
+ priv->s_ahg->ahgcount = 0;
+ priv->s_ahg->ahgidx = 0;
+ priv->s_ahg->tx_flags = 0;
+ /* pbc */
+ ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
+
+ return 1;
+
+done_free_tx:
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ return 1;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+
+bail_no_tx:
+ ps->s_txreq = NULL;
+ qp->s_flags &= ~RVT_S_BUSY;
+ qp->s_hdrwords = 0;
+ return 0;
+}
+
+/*
+ * Hardware can't check this so we do it here.
+ *
+ * This is a slightly different algorithm than the standard pkey check. It
+ * special cases the management keys and allows for 0x7fff and 0xffff to be in
+ * the table at the same time.
+ *
+ * @returns the index found or -1 if not found
+ */
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ unsigned i;
+
+ if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
+ unsigned lim_idx = -1;
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
+ /* here we look for an exact match */
+ if (ppd->pkeys[i] == pkey)
+ return i;
+ if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
+ lim_idx = i;
+ }
+
+ /* did not find 0xffff return 0x7fff idx if found */
+ if (pkey == FULL_MGMT_P_KEY)
+ return lim_idx;
+
+ /* no match... */
+ return -1;
+ }
+
+ pkey &= 0x7fff; /* remove limited/full membership bit */
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
+ if ((ppd->pkeys[i] & 0x7fff) == pkey)
+ return i;
+
+ /*
+ * Should not get here, this means hardware failed to validate pkeys.
+ */
+ return -1;
+}
+
+void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
+ u32 pkey, u32 slid, u32 dlid, u8 sc5,
+ const struct ib_grh *old_grh)
+{
+ u64 pbc, pbc_flags = 0;
+ u32 bth0, plen, vl, hwords = 5;
+ u16 lrh0;
+ u8 sl = ibp->sc_to_sl[sc5];
+ struct hfi1_ib_header hdr;
+ struct hfi1_other_headers *ohdr;
+ struct pio_buf *pbuf;
+ struct send_context *ctxt = qp_to_send_context(qp, sc5);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ if (old_grh) {
+ struct ib_grh *grh = &hdr.u.l.grh;
+
+ grh->version_tclass_flow = old_grh->version_tclass_flow;
+ grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
+ grh->hop_limit = 0xff;
+ grh->sgid = old_grh->dgid;
+ grh->dgid = old_grh->sgid;
+ ohdr = &hdr.u.l.oth;
+ lrh0 = HFI1_LRH_GRH;
+ hwords += sizeof(struct ib_grh) / sizeof(u32);
+ } else {
+ ohdr = &hdr.u.oth;
+ lrh0 = HFI1_LRH_BTH;
+ }
+
+ lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
+
+ bth0 = pkey | (IB_OPCODE_CNP << 24);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+
+ ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
+ ohdr->bth[2] = 0; /* PSN 0 */
+
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(slid);
+
+ plen = 2 /* PBC */ + hwords;
+ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+ vl = sc_to_vlt(ppd->dd, sc5);
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+ if (ctxt) {
+ pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
+ if (pbuf)
+ ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
+ &hdr, hwords);
+ }
+}
+
+/*
+ * opa_smp_check() - Do the regular pkey checking, and the additional
+ * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
+ * ("SMA Packet Checks").
+ *
+ * Note that:
+ * - Checks are done using the pkey directly from the packet's BTH,
+ * and specifically _not_ the pkey that we attach to the completion,
+ * which may be different.
+ * - These checks are specifically for "non-local" SMPs (i.e., SMPs
+ * which originated on another node). SMPs which are sent from, and
+ * destined to this node are checked in opa_local_smp_check().
+ *
+ * At the point where opa_smp_check() is called, we know:
+ * - destination QP is QP0
+ *
+ * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
+ */
+static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
+ struct rvt_qp *qp, u16 slid, struct opa_smp *smp)
+{
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ /*
+ * I don't think it's possible for us to get here with sc != 0xf,
+ * but check it to be certain.
+ */
+ if (sc5 != 0xf)
+ return 1;
+
+ if (rcv_pkey_check(ppd, pkey, sc5, slid))
+ return 1;
+
+ /*
+ * At this point we know (and so don't need to check again) that
+ * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
+ * (see ingress_pkey_check).
+ */
+ if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
+ smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+ }
+
+ /*
+ * SMPs fall into one of four (disjoint) categories:
+ * SMA request, SMA response, trap, or trap repress.
+ * Our response depends, in part, on which type of
+ * SMP we're processing.
+ *
+ * If this is not an SMA request, or trap repress:
+ * - accept MAD if the port is running an SM
+ * - pkey == FULL_MGMT_P_KEY =>
+ * reply with unsupported method (i.e., just mark
+ * the smp's status field here, and let it be
+ * processed normally)
+ * - pkey != LIM_MGMT_P_KEY =>
+ * increment port recv constraint errors, drop MAD
+ * If this is an SMA request or trap repress:
+ * - pkey != FULL_MGMT_P_KEY =>
+ * increment port recv constraint errors, drop MAD
+ */
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ if (pkey != FULL_MGMT_P_KEY) {
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+ }
+ break;
+ case IB_MGMT_METHOD_SEND:
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ if (ibp->rvp.port_cap_flags & IB_PORT_SM)
+ return 0;
+ if (pkey == FULL_MGMT_P_KEY) {
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ return 0;
+ }
+ if (pkey != LIM_MGMT_P_KEY) {
+ ingress_pkey_table_fail(ppd, pkey, slid);
+ return 1;
+ }
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+/**
+ * hfi1_ud_rcv - receive an incoming UD packet
+ * @ibp: the port the packet came in on
+ * @hdr: the packet header
+ * @rcv_flags: flags relevant to rcv processing
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_ud_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_other_headers *ohdr = packet->ohdr;
+ int opcode;
+ u32 hdrsize = packet->hlen;
+ struct ib_wc wc;
+ u32 qkey;
+ u32 src_qp;
+ u16 dlid, pkey;
+ int mgmt_pkey_idx = -1;
+ struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 rcv_flags = packet->rcv_flags;
+ void *data = packet->ebuf;
+ u32 tlen = packet->tlen;
+ struct rvt_qp *qp = packet->qp;
+ bool has_grh = rcv_flags & HFI1_HAS_GRH;
+ u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf);
+ u32 bth1;
+ u8 sl_from_sc, sl;
+ u16 slid;
+ u8 extra_bytes;
+
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
+ dlid = be16_to_cpu(hdr->lrh[1]);
+ bth1 = be32_to_cpu(ohdr->bth[1]);
+ slid = be16_to_cpu(hdr->lrh[3]);
+ pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+ sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+ extra_bytes = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ extra_bytes += (SIZE_OF_CRC << 2);
+ sl_from_sc = ibp->sc_to_sl[sc5];
+
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ opcode &= 0xff;
+
+ process_ecn(qp, packet, (opcode != IB_OPCODE_CNP));
+ /*
+ * Get the number of bytes the message was padded by
+ * and drop incomplete packets.
+ */
+ if (unlikely(tlen < (hdrsize + extra_bytes)))
+ goto drop;
+
+ tlen -= hdrsize + extra_bytes;
+
+ /*
+ * Check that the permissive LID is only used on QP0
+ * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+ */
+ if (qp->ibqp.qp_num) {
+ if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE))
+ goto drop;
+ if (qp->ibqp.qp_num > 1) {
+ if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
+ /*
+ * Traps will not be sent for packets dropped
+ * by the HW. This is fine, as sending trap
+ * for invalid pkeys is optional according to
+ * IB spec (release 1.3, section 10.9.4)
+ */
+ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
+ pkey, sl,
+ src_qp, qp->ibqp.qp_num,
+ slid, dlid);
+ return;
+ }
+ } else {
+ /* GSI packet */
+ mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+ if (mgmt_pkey_idx < 0)
+ goto drop;
+ }
+ if (unlikely(qkey != qp->qkey)) {
+ hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl,
+ src_qp, qp->ibqp.qp_num,
+ slid, dlid);
+ return;
+ }
+ /* Drop invalid MAD packets (see 13.5.3.1). */
+ if (unlikely(qp->ibqp.qp_num == 1 &&
+ (tlen > 2048 || (sc5 == 0xF))))
+ goto drop;
+ } else {
+ /* Received on QP0, and so by definition, this is an SMP */
+ struct opa_smp *smp = (struct opa_smp *)data;
+
+ if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
+ goto drop;
+
+ if (tlen > 2048)
+ goto drop;
+ if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+ smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ goto drop;
+
+ /* look up SMI pkey */
+ mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+ if (mgmt_pkey_idx < 0)
+ goto drop;
+ }
+
+ if (qp->ibqp.qp_num > 1 &&
+ opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+ wc.ex.imm_data = ohdr->u.ud.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ tlen -= sizeof(u32);
+ } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+ } else {
+ goto drop;
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ wc.byte_len = tlen + sizeof(struct ib_grh);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & RVT_R_REUSE_SGE) {
+ qp->r_flags &= ~RVT_R_REUSE_SGE;
+ } else {
+ int ret;
+
+ ret = hfi1_rvt_get_rwqe(qp, 0);
+ if (ret < 0) {
+ hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return;
+ }
+ if (!ret) {
+ if (qp->ibqp.qp_num == 0)
+ ibp->rvp.n_vl15_dropped++;
+ return;
+ }
+ }
+ /* Silently drop packets which are too big. */
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= RVT_R_REUSE_SGE;
+ goto drop;
+ }
+ if (has_grh) {
+ hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+ sizeof(struct ib_grh), 1, 0);
+ wc.wc_flags |= IB_WC_GRH;
+ } else {
+ hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+ }
+ hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+ 1, 0);
+ rvt_put_ss(&qp->r_sge);
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+ return;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = src_qp;
+
+ if (qp->ibqp.qp_type == IB_QPT_GSI ||
+ qp->ibqp.qp_type == IB_QPT_SMI) {
+ if (mgmt_pkey_idx < 0) {
+ if (net_ratelimit()) {
+ struct hfi1_devdata *dd = ppd->dd;
+
+ dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
+ qp->ibqp.qp_type);
+ mgmt_pkey_idx = 0;
+ }
+ }
+ wc.pkey_index = (unsigned)mgmt_pkey_idx;
+ } else {
+ wc.pkey_index = 0;
+ }
+
+ wc.slid = slid;
+ wc.sl = sl_from_sc;
+
+ /*
+ * Save the LMC lower bits if the destination LID is a unicast LID.
+ */
+ wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 :
+ dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+ wc.port_num = qp->port_num;
+ /* Signal completion event if the solicited bit is set. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ return;
+
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
new file mode 100644
index 000000000000..64d26525435a
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -0,0 +1,1056 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <asm/page.h>
+
+#include "user_exp_rcv.h"
+#include "trace.h"
+#include "mmu_rb.h"
+
+struct tid_group {
+ struct list_head list;
+ unsigned base;
+ u8 size;
+ u8 used;
+ u8 map;
+};
+
+struct tid_rb_node {
+ struct mmu_rb_node mmu;
+ unsigned long phys;
+ struct tid_group *grp;
+ u32 rcventry;
+ dma_addr_t dma_addr;
+ bool freed;
+ unsigned npages;
+ struct page *pages[0];
+};
+
+struct tid_pageset {
+ u16 idx;
+ u16 count;
+};
+
+#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
+
+#define num_user_pages(vaddr, len) \
+ (1 + (((((unsigned long)(vaddr) + \
+ (unsigned long)(len) - 1) & PAGE_MASK) - \
+ ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
+ struct hfi1_filedata *);
+static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
+static int set_rcvarray_entry(struct file *, unsigned long, u32,
+ struct tid_group *, struct page **, unsigned);
+static int tid_rb_insert(void *, struct mmu_rb_node *);
+static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
+ struct tid_rb_node *tnode);
+static void tid_rb_remove(void *, struct mmu_rb_node *);
+static int tid_rb_invalidate(void *, struct mmu_rb_node *);
+static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
+ struct tid_pageset *, unsigned, u16, struct page **,
+ u32 *, unsigned *, unsigned *);
+static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
+
+static struct mmu_rb_ops tid_rb_ops = {
+ .insert = tid_rb_insert,
+ .remove = tid_rb_remove,
+ .invalidate = tid_rb_invalidate
+};
+
+static inline u32 rcventry2tidinfo(u32 rcventry)
+{
+ u32 pair = rcventry & ~0x1;
+
+ return EXP_TID_SET(IDX, pair >> 1) |
+ EXP_TID_SET(CTRL, 1 << (rcventry - pair));
+}
+
+static inline void exp_tid_group_init(struct exp_tid_set *set)
+{
+ INIT_LIST_HEAD(&set->list);
+ set->count = 0;
+}
+
+static inline void tid_group_remove(struct tid_group *grp,
+ struct exp_tid_set *set)
+{
+ list_del_init(&grp->list);
+ set->count--;
+}
+
+static inline void tid_group_add_tail(struct tid_group *grp,
+ struct exp_tid_set *set)
+{
+ list_add_tail(&grp->list, &set->list);
+ set->count++;
+}
+
+static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
+{
+ struct tid_group *grp =
+ list_first_entry(&set->list, struct tid_group, list);
+ list_del_init(&grp->list);
+ set->count--;
+ return grp;
+}
+
+static inline void tid_group_move(struct tid_group *group,
+ struct exp_tid_set *s1,
+ struct exp_tid_set *s2)
+{
+ tid_group_remove(group, s1);
+ tid_group_add_tail(group, s2);
+}
+
+/*
+ * Initialize context and file private data needed for Expected
+ * receive caching. This needs to be done after the context has
+ * been configured with the eager/expected RcvEntry counts.
+ */
+int hfi1_user_exp_rcv_init(struct file *fp)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned tidbase;
+ int i, ret = 0;
+
+ spin_lock_init(&fd->tid_lock);
+ spin_lock_init(&fd->invalid_lock);
+
+ if (!uctxt->subctxt_cnt || !fd->subctxt) {
+ exp_tid_group_init(&uctxt->tid_group_list);
+ exp_tid_group_init(&uctxt->tid_used_list);
+ exp_tid_group_init(&uctxt->tid_full_list);
+
+ tidbase = uctxt->expected_base;
+ for (i = 0; i < uctxt->expected_count /
+ dd->rcv_entries.group_size; i++) {
+ struct tid_group *grp;
+
+ grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+ if (!grp) {
+ /*
+ * If we fail here, the groups already
+ * allocated will be freed by the close
+ * call.
+ */
+ ret = -ENOMEM;
+ goto done;
+ }
+ grp->size = dd->rcv_entries.group_size;
+ grp->base = tidbase;
+ tid_group_add_tail(grp, &uctxt->tid_group_list);
+ tidbase += dd->rcv_entries.group_size;
+ }
+ }
+
+ fd->entry_to_rb = kcalloc(uctxt->expected_count,
+ sizeof(struct rb_node *),
+ GFP_KERNEL);
+ if (!fd->entry_to_rb)
+ return -ENOMEM;
+
+ if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
+ fd->invalid_tid_idx = 0;
+ fd->invalid_tids = kzalloc(uctxt->expected_count *
+ sizeof(u32), GFP_KERNEL);
+ if (!fd->invalid_tids) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ /*
+ * Register MMU notifier callbacks. If the registration
+ * fails, continue without TID caching for this context.
+ */
+ ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
+ dd->pport->hfi1_wq,
+ &fd->handler);
+ if (ret) {
+ dd_dev_info(dd,
+ "Failed MMU notifier registration %d\n",
+ ret);
+ ret = 0;
+ }
+ }
+
+ /*
+ * PSM does not have a good way to separate, count, and
+ * effectively enforce a limit on RcvArray entries used by
+ * subctxts (when context sharing is used) when TID caching
+ * is enabled. To help with that, we calculate a per-process
+ * RcvArray entry share and enforce that.
+ * If TID caching is not in use, PSM deals with usage on its
+ * own. In that case, we allow any subctxt to take all of the
+ * entries.
+ *
+ * Make sure that we set the tid counts only after successful
+ * init.
+ */
+ spin_lock(&fd->tid_lock);
+ if (uctxt->subctxt_cnt && fd->handler) {
+ u16 remainder;
+
+ fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
+ remainder = uctxt->expected_count % uctxt->subctxt_cnt;
+ if (remainder && fd->subctxt < remainder)
+ fd->tid_limit++;
+ } else {
+ fd->tid_limit = uctxt->expected_count;
+ }
+ spin_unlock(&fd->tid_lock);
+done:
+ return ret;
+}
+
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
+{
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct tid_group *grp, *gptr;
+
+ if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
+ return 0;
+ /*
+ * The notifier would have been removed when the process'es mm
+ * was freed.
+ */
+ if (fd->handler)
+ hfi1_mmu_rb_unregister(fd->handler);
+
+ kfree(fd->invalid_tids);
+
+ if (!uctxt->cnt) {
+ if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+ unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
+ if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+ unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
+ list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
+ list) {
+ list_del_init(&grp->list);
+ kfree(grp);
+ }
+ hfi1_clear_tids(uctxt);
+ }
+
+ kfree(fd->entry_to_rb);
+ return 0;
+}
+
+/*
+ * Write an "empty" RcvArray entry.
+ * This function exists so the TID registaration code can use it
+ * to write to unused/unneeded entries and still take advantage
+ * of the WC performance improvements. The HFI will ignore this
+ * write to the RcvArray entry.
+ */
+static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
+{
+ /*
+ * Doing the WC fill writes only makes sense if the device is
+ * present and the RcvArray has been mapped as WC memory.
+ */
+ if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
+ writeq(0, dd->rcvarray_wc + (index * 8));
+}
+
+/*
+ * RcvArray entry allocation for Expected Receives is done by the
+ * following algorithm:
+ *
+ * The context keeps 3 lists of groups of RcvArray entries:
+ * 1. List of empty groups - tid_group_list
+ * This list is created during user context creation and
+ * contains elements which describe sets (of 8) of empty
+ * RcvArray entries.
+ * 2. List of partially used groups - tid_used_list
+ * This list contains sets of RcvArray entries which are
+ * not completely used up. Another mapping request could
+ * use some of all of the remaining entries.
+ * 3. List of full groups - tid_full_list
+ * This is the list where sets that are completely used
+ * up go.
+ *
+ * An attempt to optimize the usage of RcvArray entries is
+ * made by finding all sets of physically contiguous pages in a
+ * user's buffer.
+ * These physically contiguous sets are further split into
+ * sizes supported by the receive engine of the HFI. The
+ * resulting sets of pages are stored in struct tid_pageset,
+ * which describes the sets as:
+ * * .count - number of pages in this set
+ * * .idx - starting index into struct page ** array
+ * of this set
+ *
+ * From this point on, the algorithm deals with the page sets
+ * described above. The number of pagesets is divided by the
+ * RcvArray group size to produce the number of full groups
+ * needed.
+ *
+ * Groups from the 3 lists are manipulated using the following
+ * rules:
+ * 1. For each set of 8 pagesets, a complete group from
+ * tid_group_list is taken, programmed, and moved to
+ * the tid_full_list list.
+ * 2. For all remaining pagesets:
+ * 2.1 If the tid_used_list is empty and the tid_group_list
+ * is empty, stop processing pageset and return only
+ * what has been programmed up to this point.
+ * 2.2 If the tid_used_list is empty and the tid_group_list
+ * is not empty, move a group from tid_group_list to
+ * tid_used_list.
+ * 2.3 For each group is tid_used_group, program as much as
+ * can fit into the group. If the group becomes fully
+ * used, move it to tid_full_list.
+ */
+int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+ int ret = 0, need_group = 0, pinned;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+ tididx = 0, mapped, mapped_pages = 0;
+ unsigned long vaddr = tinfo->vaddr;
+ struct page **pages = NULL;
+ u32 *tidlist = NULL;
+ struct tid_pageset *pagesets = NULL;
+
+ /* Get the number of pages the user buffer spans */
+ npages = num_user_pages(vaddr, tinfo->length);
+ if (!npages)
+ return -EINVAL;
+
+ if (npages > uctxt->expected_count) {
+ dd_dev_err(dd, "Expected buffer too big\n");
+ return -EINVAL;
+ }
+
+ /* Verify that access is OK for the user buffer */
+ if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+ npages * PAGE_SIZE)) {
+ dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+ (void *)vaddr, npages);
+ return -EFAULT;
+ }
+
+ pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
+ GFP_KERNEL);
+ if (!pagesets)
+ return -ENOMEM;
+
+ /* Allocate the array of struct page pointers needed for pinning */
+ pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /*
+ * Pin all the pages of the user buffer. If we can't pin all the
+ * pages, accept the amount pinned so far and program only that.
+ * User space knows how to deal with partially programmed buffers.
+ */
+ if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
+ if (pinned <= 0) {
+ ret = pinned;
+ goto bail;
+ }
+ fd->tid_n_pinned += npages;
+
+ /* Find sets of physically contiguous pages */
+ npagesets = find_phys_blocks(pages, pinned, pagesets);
+
+ /*
+ * We don't need to access this under a lock since tid_used is per
+ * process and the same process cannot be in hfi1_user_exp_rcv_clear()
+ * and hfi1_user_exp_rcv_setup() at the same time.
+ */
+ spin_lock(&fd->tid_lock);
+ if (fd->tid_used + npagesets > fd->tid_limit)
+ pageset_count = fd->tid_limit - fd->tid_used;
+ else
+ pageset_count = npagesets;
+ spin_unlock(&fd->tid_lock);
+
+ if (!pageset_count)
+ goto bail;
+
+ ngroups = pageset_count / dd->rcv_entries.group_size;
+ tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
+ if (!tidlist) {
+ ret = -ENOMEM;
+ goto nomem;
+ }
+
+ tididx = 0;
+
+ /*
+ * From this point on, we are going to be using shared (between master
+ * and subcontexts) context resources. We need to take the lock.
+ */
+ mutex_lock(&uctxt->exp_lock);
+ /*
+ * The first step is to program the RcvArray entries which are complete
+ * groups.
+ */
+ while (ngroups && uctxt->tid_group_list.count) {
+ struct tid_group *grp =
+ tid_group_pop(&uctxt->tid_group_list);
+
+ ret = program_rcvarray(fp, vaddr, grp, pagesets,
+ pageidx, dd->rcv_entries.group_size,
+ pages, tidlist, &tididx, &mapped);
+ /*
+ * If there was a failure to program the RcvArray
+ * entries for the entire group, reset the grp fields
+ * and add the grp back to the free group list.
+ */
+ if (ret <= 0) {
+ tid_group_add_tail(grp, &uctxt->tid_group_list);
+ hfi1_cdbg(TID,
+ "Failed to program RcvArray group %d", ret);
+ goto unlock;
+ }
+
+ tid_group_add_tail(grp, &uctxt->tid_full_list);
+ ngroups--;
+ pageidx += ret;
+ mapped_pages += mapped;
+ }
+
+ while (pageidx < pageset_count) {
+ struct tid_group *grp, *ptr;
+ /*
+ * If we don't have any partially used tid groups, check
+ * if we have empty groups. If so, take one from there and
+ * put in the partially used list.
+ */
+ if (!uctxt->tid_used_list.count || need_group) {
+ if (!uctxt->tid_group_list.count)
+ goto unlock;
+
+ grp = tid_group_pop(&uctxt->tid_group_list);
+ tid_group_add_tail(grp, &uctxt->tid_used_list);
+ need_group = 0;
+ }
+ /*
+ * There is an optimization opportunity here - instead of
+ * fitting as many page sets as we can, check for a group
+ * later on in the list that could fit all of them.
+ */
+ list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
+ list) {
+ unsigned use = min_t(unsigned, pageset_count - pageidx,
+ grp->size - grp->used);
+
+ ret = program_rcvarray(fp, vaddr, grp, pagesets,
+ pageidx, use, pages, tidlist,
+ &tididx, &mapped);
+ if (ret < 0) {
+ hfi1_cdbg(TID,
+ "Failed to program RcvArray entries %d",
+ ret);
+ ret = -EFAULT;
+ goto unlock;
+ } else if (ret > 0) {
+ if (grp->used == grp->size)
+ tid_group_move(grp,
+ &uctxt->tid_used_list,
+ &uctxt->tid_full_list);
+ pageidx += ret;
+ mapped_pages += mapped;
+ need_group = 0;
+ /* Check if we are done so we break out early */
+ if (pageidx >= pageset_count)
+ break;
+ } else if (WARN_ON(ret == 0)) {
+ /*
+ * If ret is 0, we did not program any entries
+ * into this group, which can only happen if
+ * we've screwed up the accounting somewhere.
+ * Warn and try to continue.
+ */
+ need_group = 1;
+ }
+ }
+ }
+unlock:
+ mutex_unlock(&uctxt->exp_lock);
+nomem:
+ hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
+ mapped_pages, ret);
+ if (tididx) {
+ spin_lock(&fd->tid_lock);
+ fd->tid_used += tididx;
+ spin_unlock(&fd->tid_lock);
+ tinfo->tidcnt = tididx;
+ tinfo->length = mapped_pages * PAGE_SIZE;
+
+ if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+ tidlist, sizeof(tidlist[0]) * tididx)) {
+ /*
+ * On failure to copy to the user level, we need to undo
+ * everything done so far so we don't leak resources.
+ */
+ tinfo->tidlist = (unsigned long)&tidlist;
+ hfi1_user_exp_rcv_clear(fp, tinfo);
+ tinfo->tidlist = 0;
+ ret = -EFAULT;
+ goto bail;
+ }
+ }
+
+ /*
+ * If not everything was mapped (due to insufficient RcvArray entries,
+ * for example), unpin all unmapped pages so we can pin them nex time.
+ */
+ if (mapped_pages != pinned) {
+ hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
+ pinned - mapped_pages,
+ false);
+ fd->tid_n_pinned -= pinned - mapped_pages;
+ }
+bail:
+ kfree(pagesets);
+ kfree(pages);
+ kfree(tidlist);
+ return ret > 0 ? 0 : ret;
+}
+
+int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+ int ret = 0;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ u32 *tidinfo;
+ unsigned tididx;
+
+ tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
+ if (!tidinfo)
+ return -ENOMEM;
+
+ if (copy_from_user(tidinfo, (void __user *)(unsigned long)
+ tinfo->tidlist, sizeof(tidinfo[0]) *
+ tinfo->tidcnt)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ mutex_lock(&uctxt->exp_lock);
+ for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
+ ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
+ if (ret) {
+ hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
+ ret);
+ break;
+ }
+ }
+ spin_lock(&fd->tid_lock);
+ fd->tid_used -= tididx;
+ spin_unlock(&fd->tid_lock);
+ tinfo->tidcnt = tididx;
+ mutex_unlock(&uctxt->exp_lock);
+done:
+ kfree(tidinfo);
+ return ret;
+}
+
+int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ unsigned long *ev = uctxt->dd->events +
+ (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
+ u32 *array;
+ int ret = 0;
+
+ if (!fd->invalid_tids)
+ return -EINVAL;
+
+ /*
+ * copy_to_user() can sleep, which will leave the invalid_lock
+ * locked and cause the MMU notifier to be blocked on the lock
+ * for a long time.
+ * Copy the data to a local buffer so we can release the lock.
+ */
+ array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
+ if (!array)
+ return -EFAULT;
+
+ spin_lock(&fd->invalid_lock);
+ if (fd->invalid_tid_idx) {
+ memcpy(array, fd->invalid_tids, sizeof(*array) *
+ fd->invalid_tid_idx);
+ memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
+ fd->invalid_tid_idx);
+ tinfo->tidcnt = fd->invalid_tid_idx;
+ fd->invalid_tid_idx = 0;
+ /*
+ * Reset the user flag while still holding the lock.
+ * Otherwise, PSM can miss events.
+ */
+ clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+ } else {
+ tinfo->tidcnt = 0;
+ }
+ spin_unlock(&fd->invalid_lock);
+
+ if (tinfo->tidcnt) {
+ if (copy_to_user((void __user *)tinfo->tidlist,
+ array, sizeof(*array) * tinfo->tidcnt))
+ ret = -EFAULT;
+ }
+ kfree(array);
+
+ return ret;
+}
+
+static u32 find_phys_blocks(struct page **pages, unsigned npages,
+ struct tid_pageset *list)
+{
+ unsigned pagecount, pageidx, setcount = 0, i;
+ unsigned long pfn, this_pfn;
+
+ if (!npages)
+ return 0;
+
+ /*
+ * Look for sets of physically contiguous pages in the user buffer.
+ * This will allow us to optimize Expected RcvArray entry usage by
+ * using the bigger supported sizes.
+ */
+ pfn = page_to_pfn(pages[0]);
+ for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+ this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
+
+ /*
+ * If the pfn's are not sequential, pages are not physically
+ * contiguous.
+ */
+ if (this_pfn != ++pfn) {
+ /*
+ * At this point we have to loop over the set of
+ * physically contiguous pages and break them down it
+ * sizes supported by the HW.
+ * There are two main constraints:
+ * 1. The max buffer size is MAX_EXPECTED_BUFFER.
+ * If the total set size is bigger than that
+ * program only a MAX_EXPECTED_BUFFER chunk.
+ * 2. The buffer size has to be a power of two. If
+ * it is not, round down to the closes power of
+ * 2 and program that size.
+ */
+ while (pagecount) {
+ int maxpages = pagecount;
+ u32 bufsize = pagecount * PAGE_SIZE;
+
+ if (bufsize > MAX_EXPECTED_BUFFER)
+ maxpages =
+ MAX_EXPECTED_BUFFER >>
+ PAGE_SHIFT;
+ else if (!is_power_of_2(bufsize))
+ maxpages =
+ rounddown_pow_of_two(bufsize) >>
+ PAGE_SHIFT;
+
+ list[setcount].idx = pageidx;
+ list[setcount].count = maxpages;
+ pagecount -= maxpages;
+ pageidx += maxpages;
+ setcount++;
+ }
+ pageidx = i;
+ pagecount = 1;
+ pfn = this_pfn;
+ } else {
+ pagecount++;
+ }
+ }
+ return setcount;
+}
+
+/**
+ * program_rcvarray() - program an RcvArray group with receive buffers
+ * @fp: file pointer
+ * @vaddr: starting user virtual address
+ * @grp: RcvArray group
+ * @sets: array of struct tid_pageset holding information on physically
+ * contiguous chunks from the user buffer
+ * @start: starting index into sets array
+ * @count: number of struct tid_pageset's to program
+ * @pages: an array of struct page * for the user buffer
+ * @tidlist: the array of u32 elements when the information about the
+ * programmed RcvArray entries is to be encoded.
+ * @tididx: starting offset into tidlist
+ * @pmapped: (output parameter) number of pages programmed into the RcvArray
+ * entries.
+ *
+ * This function will program up to 'count' number of RcvArray entries from the
+ * group 'grp'. To make best use of write-combining writes, the function will
+ * perform writes to the unused RcvArray entries which will be ignored by the
+ * HW. Each RcvArray entry will be programmed with a physically contiguous
+ * buffer chunk from the user's virtual buffer.
+ *
+ * Return:
+ * -EINVAL if the requested count is larger than the size of the group,
+ * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
+ * number of RcvArray entries programmed.
+ */
+static int program_rcvarray(struct file *fp, unsigned long vaddr,
+ struct tid_group *grp,
+ struct tid_pageset *sets,
+ unsigned start, u16 count, struct page **pages,
+ u32 *tidlist, unsigned *tididx, unsigned *pmapped)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ u16 idx;
+ u32 tidinfo = 0, rcventry, useidx = 0;
+ int mapped = 0;
+
+ /* Count should never be larger than the group size */
+ if (count > grp->size)
+ return -EINVAL;
+
+ /* Find the first unused entry in the group */
+ for (idx = 0; idx < grp->size; idx++) {
+ if (!(grp->map & (1 << idx))) {
+ useidx = idx;
+ break;
+ }
+ rcv_array_wc_fill(dd, grp->base + idx);
+ }
+
+ idx = 0;
+ while (idx < count) {
+ u16 npages, pageidx, setidx = start + idx;
+ int ret = 0;
+
+ /*
+ * If this entry in the group is used, move to the next one.
+ * If we go past the end of the group, exit the loop.
+ */
+ if (useidx >= grp->size) {
+ break;
+ } else if (grp->map & (1 << useidx)) {
+ rcv_array_wc_fill(dd, grp->base + useidx);
+ useidx++;
+ continue;
+ }
+
+ rcventry = grp->base + useidx;
+ npages = sets[setidx].count;
+ pageidx = sets[setidx].idx;
+
+ ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
+ rcventry, grp, pages + pageidx,
+ npages);
+ if (ret)
+ return ret;
+ mapped += npages;
+
+ tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
+ EXP_TID_SET(LEN, npages);
+ tidlist[(*tididx)++] = tidinfo;
+ grp->used++;
+ grp->map |= 1 << useidx++;
+ idx++;
+ }
+
+ /* Fill the rest of the group with "blank" writes */
+ for (; useidx < grp->size; useidx++)
+ rcv_array_wc_fill(dd, grp->base + useidx);
+ *pmapped = mapped;
+ return idx;
+}
+
+static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
+ u32 rcventry, struct tid_group *grp,
+ struct page **pages, unsigned npages)
+{
+ int ret;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct tid_rb_node *node;
+ struct hfi1_devdata *dd = uctxt->dd;
+ dma_addr_t phys;
+
+ /*
+ * Allocate the node first so we can handle a potential
+ * failure before we've programmed anything.
+ */
+ node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
+ GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ phys = pci_map_single(dd->pcidev,
+ __va(page_to_phys(pages[0])),
+ npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, phys)) {
+ dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
+ phys);
+ kfree(node);
+ return -EFAULT;
+ }
+
+ node->mmu.addr = vaddr;
+ node->mmu.len = npages * PAGE_SIZE;
+ node->phys = page_to_phys(pages[0]);
+ node->npages = npages;
+ node->rcventry = rcventry;
+ node->dma_addr = phys;
+ node->grp = grp;
+ node->freed = false;
+ memcpy(node->pages, pages, sizeof(struct page *) * npages);
+
+ if (!fd->handler)
+ ret = tid_rb_insert(fd, &node->mmu);
+ else
+ ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
+
+ if (ret) {
+ hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+ node->rcventry, node->mmu.addr, node->phys, ret);
+ pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ kfree(node);
+ return -EFAULT;
+ }
+ hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
+ trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
+ node->mmu.addr, node->phys, phys);
+ return 0;
+}
+
+static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
+ struct tid_group **grp)
+{
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+ struct tid_rb_node *node;
+ u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+ u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
+
+ if (tididx >= uctxt->expected_count) {
+ dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+ tididx, uctxt->ctxt);
+ return -EINVAL;
+ }
+
+ if (tidctrl == 0x3)
+ return -EINVAL;
+
+ rcventry = tididx + (tidctrl - 1);
+
+ node = fd->entry_to_rb[rcventry];
+ if (!node || node->rcventry != (uctxt->expected_base + rcventry))
+ return -EBADF;
+
+ if (grp)
+ *grp = node->grp;
+
+ if (!fd->handler)
+ cacheless_tid_rb_remove(fd, node);
+ else
+ hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+
+ return 0;
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+{
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_devdata *dd = uctxt->dd;
+
+ trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
+ node->npages, node->mmu.addr, node->phys,
+ node->dma_addr);
+
+ hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
+ /*
+ * Make sure device has seen the write before we unpin the
+ * pages.
+ */
+ flush_wc();
+
+ pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
+ PCI_DMA_FROMDEVICE);
+ hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
+ fd->tid_n_pinned -= node->npages;
+
+ node->grp->used--;
+ node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
+
+ if (node->grp->used == node->grp->size - 1)
+ tid_group_move(node->grp, &uctxt->tid_full_list,
+ &uctxt->tid_used_list);
+ else if (!node->grp->used)
+ tid_group_move(node->grp, &uctxt->tid_used_list,
+ &uctxt->tid_group_list);
+ kfree(node);
+}
+
+/*
+ * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
+ * clearing nodes in the non-cached case.
+ */
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
+ struct exp_tid_set *set,
+ struct hfi1_filedata *fd)
+{
+ struct tid_group *grp, *ptr;
+ int i;
+
+ list_for_each_entry_safe(grp, ptr, &set->list, list) {
+ list_del_init(&grp->list);
+
+ for (i = 0; i < grp->size; i++) {
+ if (grp->map & (1 << i)) {
+ u16 rcventry = grp->base + i;
+ struct tid_rb_node *node;
+
+ node = fd->entry_to_rb[rcventry -
+ uctxt->expected_base];
+ if (!node || node->rcventry != rcventry)
+ continue;
+
+ cacheless_tid_rb_remove(fd, node);
+ }
+ }
+ }
+}
+
+/*
+ * Always return 0 from this function. A non-zero return indicates that the
+ * remove operation will be called and that memory should be unpinned.
+ * However, the driver cannot unpin out from under PSM. Instead, retain the
+ * memory (by returning 0) and inform PSM that the memory is going away. PSM
+ * will call back later when it has removed the memory from its list.
+ */
+static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
+{
+ struct hfi1_filedata *fdata = arg;
+ struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+ struct tid_rb_node *node =
+ container_of(mnode, struct tid_rb_node, mmu);
+
+ if (node->freed)
+ return 0;
+
+ trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+ node->rcventry, node->npages, node->dma_addr);
+ node->freed = true;
+
+ spin_lock(&fdata->invalid_lock);
+ if (fdata->invalid_tid_idx < uctxt->expected_count) {
+ fdata->invalid_tids[fdata->invalid_tid_idx] =
+ rcventry2tidinfo(node->rcventry - uctxt->expected_base);
+ fdata->invalid_tids[fdata->invalid_tid_idx] |=
+ EXP_TID_SET(LEN, node->npages);
+ if (!fdata->invalid_tid_idx) {
+ unsigned long *ev;
+
+ /*
+ * hfi1_set_uevent_bits() sets a user event flag
+ * for all processes. Because calling into the
+ * driver to process TID cache invalidations is
+ * expensive and TID cache invalidations are
+ * handled on a per-process basis, we can
+ * optimize this to set the flag only for the
+ * process in question.
+ */
+ ev = uctxt->dd->events +
+ (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+ HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
+ set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+ }
+ fdata->invalid_tid_idx++;
+ }
+ spin_unlock(&fdata->invalid_lock);
+ return 0;
+}
+
+static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
+{
+ struct hfi1_filedata *fdata = arg;
+ struct tid_rb_node *tnode =
+ container_of(node, struct tid_rb_node, mmu);
+ u32 base = fdata->uctxt->expected_base;
+
+ fdata->entry_to_rb[tnode->rcventry - base] = tnode;
+ return 0;
+}
+
+static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
+ struct tid_rb_node *tnode)
+{
+ u32 base = fdata->uctxt->expected_base;
+
+ fdata->entry_to_rb[tnode->rcventry - base] = NULL;
+ clear_tid_node(fdata, tnode);
+}
+
+static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
+{
+ struct hfi1_filedata *fdata = arg;
+ struct tid_rb_node *tnode =
+ container_of(node, struct tid_rb_node, mmu);
+
+ cacheless_tid_rb_remove(fdata, tnode);
+}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
new file mode 100644
index 000000000000..9bc8d9fba87e
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -0,0 +1,79 @@
+#ifndef _HFI1_USER_EXP_RCV_H
+#define _HFI1_USER_EXP_RCV_H
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+#define EXP_TID_TIDLEN_MASK 0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT 0
+#define EXP_TID_TIDCTRL_MASK 0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK 0x3FFULL
+#define EXP_TID_TIDIDX_SHIFT 22
+#define EXP_TID_GET(tid, field) \
+ (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+#define EXP_TID_SET(field, value) \
+ (((value) & EXP_TID_TID##field##_MASK) << \
+ EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) ({ \
+ (tid) &= ~(EXP_TID_TID##field##_MASK << \
+ EXP_TID_TID##field##_SHIFT); \
+ })
+#define EXP_TID_RESET(tid, field, value) do { \
+ EXP_TID_CLEAR(tid, field); \
+ (tid) |= EXP_TID_SET(field, (value)); \
+ } while (0)
+
+int hfi1_user_exp_rcv_init(struct file *);
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
+int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *);
+
+#endif /* _HFI1_USER_EXP_RCV_H */
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
new file mode 100644
index 000000000000..20f4ddcac3b0
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+
+static unsigned long cache_size = 256;
+module_param(cache_size, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
+
+/*
+ * Determine whether the caller can pin pages.
+ *
+ * This function should be used in the implementation of buffer caches.
+ * The cache implementation should call this function prior to attempting
+ * to pin buffer pages in order to determine whether they should do so.
+ * The function computes cache limits based on the configured ulimit and
+ * cache size. Use of this function is especially important for caches
+ * which are not limited in any other way (e.g. by HW resources) and, thus,
+ * could keeping caching buffers.
+ *
+ */
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
+ u32 nlocked, u32 npages)
+{
+ unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
+ size = (cache_size * (1UL << 20)); /* convert to bytes */
+ unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt;
+ bool can_lock = capable(CAP_IPC_LOCK);
+
+ /*
+ * Calculate per-cache size. The calculation below uses only a quarter
+ * of the available per-context limit. This leaves space for other
+ * pinning. Should we worry about shared ctxts?
+ */
+ cache_limit = (ulimit / usr_ctxts) / 4;
+
+ /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
+ if (ulimit != (-1UL) && size > cache_limit)
+ size = cache_limit;
+
+ /* Convert to number of pages */
+ size = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ down_read(&mm->mmap_sem);
+ pinned = mm->pinned_vm;
+ up_read(&mm->mmap_sem);
+
+ /* First, check the absolute limit against all pinned pages. */
+ if (pinned + npages >= ulimit && !can_lock)
+ return false;
+
+ return ((nlocked + npages) <= size) || can_lock;
+}
+
+int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages,
+ bool writable, struct page **pages)
+{
+ int ret;
+
+ ret = get_user_pages_fast(vaddr, npages, writable, pages);
+ if (ret < 0)
+ return ret;
+
+ down_write(&mm->mmap_sem);
+ mm->pinned_vm += ret;
+ up_write(&mm->mmap_sem);
+
+ return ret;
+}
+
+void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
+ size_t npages, bool dirty)
+{
+ size_t i;
+
+ for (i = 0; i < npages; i++) {
+ if (dirty)
+ set_page_dirty_lock(p[i]);
+ put_page(p[i]);
+ }
+
+ if (mm) { /* during close after signal, mm can be NULL */
+ down_write(&mm->mmap_sem);
+ mm->pinned_vm -= npages;
+ up_write(&mm->mmap_sem);
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
new file mode 100644
index 000000000000..1694037d1eee
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -0,0 +1,1669 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "sdma.h"
+#include "user_sdma.h"
+#include "verbs.h" /* for the headers */
+#include "common.h" /* for struct hfi1_tid_info */
+#include "trace.h"
+#include "mmu_rb.h"
+
+static uint hfi1_sdma_comp_ring_size = 128;
+module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
+
+/* The maximum number of Data io vectors per message/request */
+#define MAX_VECTORS_PER_REQ 8
+/*
+ * Maximum number of packet to send from each message/request
+ * before moving to the next one.
+ */
+#define MAX_PKTS_PER_QUEUE 16
+
+#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
+
+#define req_opcode(x) \
+ (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_version(x) \
+ (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_iovcnt(x) \
+ (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
+
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define BTH_SEQ_MASK 0x7ffull
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT 0
+#define KDETH_OFFSET_MASK 0x7fff
+#define KDETH_OM_SHIFT 15
+#define KDETH_OM_MASK 0x1
+#define KDETH_TID_SHIFT 16
+#define KDETH_TID_MASK 0x3ff
+#define KDETH_TIDCTRL_SHIFT 26
+#define KDETH_TIDCTRL_MASK 0x3
+#define KDETH_INTR_SHIFT 28
+#define KDETH_INTR_MASK 0x1
+#define KDETH_SH_SHIFT 29
+#define KDETH_SH_MASK 0x1
+#define KDETH_HCRC_UPPER_SHIFT 16
+#define KDETH_HCRC_UPPER_MASK 0xff
+#define KDETH_HCRC_LOWER_SHIFT 24
+#define KDETH_HCRC_LOWER_MASK 0xff
+
+#define AHG_KDETH_INTR_SHIFT 12
+
+#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
+#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
+
+#define KDETH_GET(val, field) \
+ (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do { \
+ u32 dwval = le32_to_cpu(dw); \
+ dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+ dwval |= (((val) & KDETH_##field##_MASK) << \
+ KDETH_##field##_SHIFT); \
+ dw = cpu_to_le32(dwval); \
+ } while (0)
+
+#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \
+ do { \
+ if ((idx) < ARRAY_SIZE((arr))) \
+ (arr)[(idx++)] = sdma_build_ahg_descriptor( \
+ (__force u16)(value), (dw), (bit), \
+ (width)); \
+ else \
+ return -ERANGE; \
+ } while (0)
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL 4
+#define KDETH_OM_LARGE 64
+#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
+/* Last packet in the request */
+#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
+
+/* SDMA request flag bits */
+#define SDMA_REQ_FOR_THREAD 1
+#define SDMA_REQ_SEND_DONE 2
+#define SDMA_REQ_HAVE_AHG 3
+#define SDMA_REQ_HAS_ERROR 4
+#define SDMA_REQ_DONE_ERROR 5
+
+#define SDMA_PKT_Q_INACTIVE BIT(0)
+#define SDMA_PKT_Q_ACTIVE BIT(1)
+#define SDMA_PKT_Q_DEFERRED BIT(2)
+
+/*
+ * Maximum retry attempts to submit a TX request
+ * before putting the process to sleep.
+ */
+#define MAX_DEFER_RETRY_COUNT 1
+
+static unsigned initial_pkt_count = 8;
+
+#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
+
+struct sdma_mmu_node;
+
+struct user_sdma_iovec {
+ struct list_head list;
+ struct iovec iov;
+ /* number of pages in this vector */
+ unsigned npages;
+ /* array of pinned pages for this vector */
+ struct page **pages;
+ /*
+ * offset into the virtual address space of the vector at
+ * which we last left off.
+ */
+ u64 offset;
+ struct sdma_mmu_node *node;
+};
+
+struct sdma_mmu_node {
+ struct mmu_rb_node rb;
+ struct hfi1_user_sdma_pkt_q *pq;
+ atomic_t refcount;
+ struct page **pages;
+ unsigned npages;
+};
+
+/* evict operation argument */
+struct evict_data {
+ u32 cleared; /* count evicted so far */
+ u32 target; /* target count to evict */
+};
+
+struct user_sdma_request {
+ struct sdma_req_info info;
+ struct hfi1_user_sdma_pkt_q *pq;
+ struct hfi1_user_sdma_comp_q *cq;
+ /* This is the original header from user space */
+ struct hfi1_pkt_header hdr;
+ /*
+ * Pointer to the SDMA engine for this request.
+ * Since different request could be on different VLs,
+ * each request will need it's own engine pointer.
+ */
+ struct sdma_engine *sde;
+ u8 ahg_idx;
+ u32 ahg[9];
+ /*
+ * KDETH.Offset (Eager) field
+ * We need to remember the initial value so the headers
+ * can be updated properly.
+ */
+ u32 koffset;
+ /*
+ * KDETH.OFFSET (TID) field
+ * The offset can cover multiple packets, depending on the
+ * size of the TID entry.
+ */
+ u32 tidoffset;
+ /*
+ * KDETH.OM
+ * Remember this because the header template always sets it
+ * to 0.
+ */
+ u8 omfactor;
+ /*
+ * We copy the iovs for this request (based on
+ * info.iovcnt). These are only the data vectors
+ */
+ unsigned data_iovs;
+ /* total length of the data in the request */
+ u32 data_len;
+ /* progress index moving along the iovs array */
+ unsigned iov_idx;
+ struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
+ /* number of elements copied to the tids array */
+ u16 n_tids;
+ /* TID array values copied from the tid_iov vector */
+ u32 *tids;
+ u16 tididx;
+ u32 sent;
+ u64 seqnum;
+ u64 seqcomp;
+ u64 seqsubmitted;
+ struct list_head txps;
+ unsigned long flags;
+ /* status of the last txreq completed */
+ int status;
+};
+
+/*
+ * A single txreq could span up to 3 physical pages when the MTU
+ * is sufficiently large (> 4K). Each of the IOV pointers also
+ * needs it's own set of flags so the vector has been handled
+ * independently of each other.
+ */
+struct user_sdma_txreq {
+ /* Packet header for the txreq */
+ struct hfi1_pkt_header hdr;
+ struct sdma_txreq txreq;
+ struct list_head list;
+ struct user_sdma_request *req;
+ u16 flags;
+ unsigned busycount;
+ u64 seqnum;
+};
+
+#define SDMA_DBG(req, fmt, ...) \
+ hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
+ (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
+ ##__VA_ARGS__)
+#define SDMA_Q_DBG(pq, fmt, ...) \
+ hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
+ (pq)->subctxt, ##__VA_ARGS__)
+
+static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
+static int num_user_pages(const struct iovec *);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int);
+static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
+static void user_sdma_free_request(struct user_sdma_request *, bool);
+static int pin_vector_pages(struct user_sdma_request *,
+ struct user_sdma_iovec *);
+static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
+ unsigned);
+static int check_header_template(struct user_sdma_request *,
+ struct hfi1_pkt_header *, u32, u32);
+static int set_txreq_header(struct user_sdma_request *,
+ struct user_sdma_txreq *, u32);
+static int set_txreq_header_ahg(struct user_sdma_request *,
+ struct user_sdma_txreq *, u32);
+static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
+ struct hfi1_user_sdma_comp_q *,
+ u16, enum hfi1_sdma_comp_state, int);
+static inline u32 set_pkt_bth_psn(__be32, u8, u32);
+static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
+
+static int defer_packet_queue(
+ struct sdma_engine *,
+ struct iowait *,
+ struct sdma_txreq *,
+ unsigned seq);
+static void activate_packet_queue(struct iowait *, int);
+static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
+static int sdma_rb_insert(void *, struct mmu_rb_node *);
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+ void *arg2, bool *stop);
+static void sdma_rb_remove(void *, struct mmu_rb_node *);
+static int sdma_rb_invalidate(void *, struct mmu_rb_node *);
+
+static struct mmu_rb_ops sdma_rb_ops = {
+ .filter = sdma_rb_filter,
+ .insert = sdma_rb_insert,
+ .evict = sdma_rb_evict,
+ .remove = sdma_rb_remove,
+ .invalidate = sdma_rb_invalidate
+};
+
+static int defer_packet_queue(
+ struct sdma_engine *sde,
+ struct iowait *wait,
+ struct sdma_txreq *txreq,
+ unsigned seq)
+{
+ struct hfi1_user_sdma_pkt_q *pq =
+ container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+ struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
+ struct user_sdma_txreq *tx =
+ container_of(txreq, struct user_sdma_txreq, txreq);
+
+ if (sdma_progress(sde, seq, txreq)) {
+ if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
+ goto eagain;
+ }
+ /*
+ * We are assuming that if the list is enqueued somewhere, it
+ * is to the dmawait list since that is the only place where
+ * it is supposed to be enqueued.
+ */
+ xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
+ write_seqlock(&dev->iowait_lock);
+ if (list_empty(&pq->busy.list))
+ list_add_tail(&pq->busy.list, &sde->dmawait);
+ write_sequnlock(&dev->iowait_lock);
+ return -EBUSY;
+eagain:
+ return -EAGAIN;
+}
+
+static void activate_packet_queue(struct iowait *wait, int reason)
+{
+ struct hfi1_user_sdma_pkt_q *pq =
+ container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+ xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+ wake_up(&wait->wait_dma);
+};
+
+static void sdma_kmem_cache_ctor(void *obj)
+{
+ struct user_sdma_txreq *tx = obj;
+
+ memset(tx, 0, sizeof(*tx));
+}
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
+{
+ struct hfi1_filedata *fd;
+ int ret = 0;
+ unsigned memsize;
+ char buf[64];
+ struct hfi1_devdata *dd;
+ struct hfi1_user_sdma_comp_q *cq;
+ struct hfi1_user_sdma_pkt_q *pq;
+ unsigned long flags;
+
+ if (!uctxt || !fp) {
+ ret = -EBADF;
+ goto done;
+ }
+
+ fd = fp->private_data;
+
+ if (!hfi1_sdma_comp_ring_size) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ dd = uctxt->dd;
+
+ pq = kzalloc(sizeof(*pq), GFP_KERNEL);
+ if (!pq)
+ goto pq_nomem;
+
+ memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
+ pq->reqs = kzalloc(memsize, GFP_KERNEL);
+ if (!pq->reqs)
+ goto pq_reqs_nomem;
+
+ memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long);
+ pq->req_in_use = kzalloc(memsize, GFP_KERNEL);
+ if (!pq->req_in_use)
+ goto pq_reqs_no_in_use;
+
+ INIT_LIST_HEAD(&pq->list);
+ pq->dd = dd;
+ pq->ctxt = uctxt->ctxt;
+ pq->subctxt = fd->subctxt;
+ pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+ pq->state = SDMA_PKT_Q_INACTIVE;
+ atomic_set(&pq->n_reqs, 0);
+ init_waitqueue_head(&pq->wait);
+ atomic_set(&pq->n_locked, 0);
+ pq->mm = fd->mm;
+
+ iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+ activate_packet_queue, NULL);
+ pq->reqidx = 0;
+ snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
+ fd->subctxt);
+ pq->txreq_cache = kmem_cache_create(buf,
+ sizeof(struct user_sdma_txreq),
+ L1_CACHE_BYTES,
+ SLAB_HWCACHE_ALIGN,
+ sdma_kmem_cache_ctor);
+ if (!pq->txreq_cache) {
+ dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
+ uctxt->ctxt);
+ goto pq_txreq_nomem;
+ }
+ fd->pq = pq;
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq)
+ goto cq_nomem;
+
+ memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
+ cq->comps = vmalloc_user(memsize);
+ if (!cq->comps)
+ goto cq_comps_nomem;
+
+ cq->nentries = hfi1_sdma_comp_ring_size;
+ fd->cq = cq;
+
+ ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
+ &pq->handler);
+ if (ret) {
+ dd_dev_err(dd, "Failed to register with MMU %d", ret);
+ goto done;
+ }
+
+ spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+ list_add(&pq->list, &uctxt->sdma_queues);
+ spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+ goto done;
+
+cq_comps_nomem:
+ kfree(cq);
+cq_nomem:
+ kmem_cache_destroy(pq->txreq_cache);
+pq_txreq_nomem:
+ kfree(pq->req_in_use);
+pq_reqs_no_in_use:
+ kfree(pq->reqs);
+pq_reqs_nomem:
+ kfree(pq);
+ fd->pq = NULL;
+pq_nomem:
+ ret = -ENOMEM;
+done:
+ return ret;
+}
+
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+{
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_user_sdma_pkt_q *pq;
+ unsigned long flags;
+
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
+ uctxt->ctxt, fd->subctxt);
+ pq = fd->pq;
+ if (pq) {
+ if (pq->handler)
+ hfi1_mmu_rb_unregister(pq->handler);
+ spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+ if (!list_empty(&pq->list))
+ list_del_init(&pq->list);
+ spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+ iowait_sdma_drain(&pq->busy);
+ /* Wait until all requests have been freed. */
+ wait_event_interruptible(
+ pq->wait,
+ (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
+ kfree(pq->reqs);
+ kfree(pq->req_in_use);
+ kmem_cache_destroy(pq->txreq_cache);
+ kfree(pq);
+ fd->pq = NULL;
+ }
+ if (fd->cq) {
+ vfree(fd->cq->comps);
+ kfree(fd->cq);
+ fd->cq = NULL;
+ }
+ return 0;
+}
+
+static u8 dlid_to_selector(u16 dlid)
+{
+ static u8 mapping[256];
+ static int initialized;
+ static u8 next;
+ int hash;
+
+ if (!initialized) {
+ memset(mapping, 0xFF, 256);
+ initialized = 1;
+ }
+
+ hash = ((dlid >> 8) ^ dlid) & 0xFF;
+ if (mapping[hash] == 0xFF) {
+ mapping[hash] = next;
+ next = (next + 1) & 0x7F;
+ }
+
+ return mapping[hash];
+}
+
+int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
+ unsigned long dim, unsigned long *count)
+{
+ int ret = 0, i;
+ struct hfi1_filedata *fd = fp->private_data;
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+ struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+ struct hfi1_user_sdma_comp_q *cq = fd->cq;
+ struct hfi1_devdata *dd = pq->dd;
+ unsigned long idx = 0;
+ u8 pcount = initial_pkt_count;
+ struct sdma_req_info info;
+ struct user_sdma_request *req;
+ u8 opcode, sc, vl;
+ int req_queued = 0;
+ u16 dlid;
+ u8 selector;
+
+ if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
+ hfi1_cdbg(
+ SDMA,
+ "[%u:%u:%u] First vector not big enough for header %lu/%lu",
+ dd->unit, uctxt->ctxt, fd->subctxt,
+ iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
+ return -EINVAL;
+ }
+ ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
+ if (ret) {
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
+ dd->unit, uctxt->ctxt, fd->subctxt, ret);
+ return -EFAULT;
+ }
+
+ trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
+ (u16 *)&info);
+
+ if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
+ hfi1_cdbg(SDMA,
+ "[%u:%u:%u:%u] Invalid comp index",
+ dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+ return -EINVAL;
+ }
+
+ /*
+ * Sanity check the header io vector count. Need at least 1 vector
+ * (header) and cannot be larger than the actual io vector count.
+ */
+ if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
+ hfi1_cdbg(SDMA,
+ "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
+ dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
+ req_iovcnt(info.ctrl), dim);
+ return -EINVAL;
+ }
+
+ if (!info.fragsize) {
+ hfi1_cdbg(SDMA,
+ "[%u:%u:%u:%u] Request does not specify fragsize",
+ dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+ return -EINVAL;
+ }
+
+ /* Try to claim the request. */
+ if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
+ dd->unit, uctxt->ctxt, fd->subctxt,
+ info.comp_idx);
+ return -EBADSLT;
+ }
+ /*
+ * All safety checks have been done and this request has been claimed.
+ */
+ hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
+ uctxt->ctxt, fd->subctxt, info.comp_idx);
+ req = pq->reqs + info.comp_idx;
+ memset(req, 0, sizeof(*req));
+ req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
+ req->pq = pq;
+ req->cq = cq;
+ req->status = -1;
+ INIT_LIST_HEAD(&req->txps);
+
+ memcpy(&req->info, &info, sizeof(info));
+
+ if (req_opcode(info.ctrl) == EXPECTED) {
+ /* expected must have a TID info and at least one data vector */
+ if (req->data_iovs < 2) {
+ SDMA_DBG(req,
+ "Not enough vectors for expected request");
+ ret = -EINVAL;
+ goto free_req;
+ }
+ req->data_iovs--;
+ }
+
+ if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
+ SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
+ MAX_VECTORS_PER_REQ);
+ ret = -EINVAL;
+ goto free_req;
+ }
+ /* Copy the header from the user buffer */
+ ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
+ sizeof(req->hdr));
+ if (ret) {
+ SDMA_DBG(req, "Failed to copy header template (%d)", ret);
+ ret = -EFAULT;
+ goto free_req;
+ }
+
+ /* If Static rate control is not enabled, sanitize the header. */
+ if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
+ req->hdr.pbc[2] = 0;
+
+ /* Validate the opcode. Do not trust packets from user space blindly. */
+ opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
+ if ((opcode & USER_OPCODE_CHECK_MASK) !=
+ USER_OPCODE_CHECK_VAL) {
+ SDMA_DBG(req, "Invalid opcode (%d)", opcode);
+ ret = -EINVAL;
+ goto free_req;
+ }
+ /*
+ * Validate the vl. Do not trust packets from user space blindly.
+ * VL comes from PBC, SC comes from LRH, and the VL needs to
+ * match the SC look up.
+ */
+ vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
+ sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
+ (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
+ if (vl >= dd->pport->vls_operational ||
+ vl != sc_to_vlt(dd, sc)) {
+ SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
+ ret = -EINVAL;
+ goto free_req;
+ }
+
+ /* Checking P_KEY for requests from user-space */
+ if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+ PKEY_CHECK_INVALID)) {
+ ret = -EINVAL;
+ goto free_req;
+ }
+
+ /*
+ * Also should check the BTH.lnh. If it says the next header is GRH then
+ * the RXE parsing will be off and will land in the middle of the KDETH
+ * or miss it entirely.
+ */
+ if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
+ SDMA_DBG(req, "User tried to pass in a GRH");
+ ret = -EINVAL;
+ goto free_req;
+ }
+
+ req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
+ /*
+ * Calculate the initial TID offset based on the values of
+ * KDETH.OFFSET and KDETH.OM that are passed in.
+ */
+ req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
+ (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+ KDETH_OM_LARGE : KDETH_OM_SMALL);
+ SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
+ idx++;
+
+ /* Save all the IO vector structures */
+ for (i = 0; i < req->data_iovs; i++) {
+ INIT_LIST_HEAD(&req->iovs[i].list);
+ memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
+ ret = pin_vector_pages(req, &req->iovs[i]);
+ if (ret) {
+ req->status = ret;
+ goto free_req;
+ }
+ req->data_len += req->iovs[i].iov.iov_len;
+ }
+ SDMA_DBG(req, "total data length %u", req->data_len);
+
+ if (pcount > req->info.npkts)
+ pcount = req->info.npkts;
+ /*
+ * Copy any TID info
+ * User space will provide the TID info only when the
+ * request type is EXPECTED. This is true even if there is
+ * only one packet in the request and the header is already
+ * setup. The reason for the singular TID case is that the
+ * driver needs to perform safety checks.
+ */
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
+
+ if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
+ ret = -EINVAL;
+ goto free_req;
+ }
+ req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
+ if (!req->tids) {
+ ret = -ENOMEM;
+ goto free_req;
+ }
+ /*
+ * We have to copy all of the tids because they may vary
+ * in size and, therefore, the TID count might not be
+ * equal to the pkt count. However, there is no way to
+ * tell at this point.
+ */
+ ret = copy_from_user(req->tids, iovec[idx].iov_base,
+ ntids * sizeof(*req->tids));
+ if (ret) {
+ SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
+ ntids, ret);
+ ret = -EFAULT;
+ goto free_req;
+ }
+ req->n_tids = ntids;
+ idx++;
+ }
+
+ dlid = be16_to_cpu(req->hdr.lrh[1]);
+ selector = dlid_to_selector(dlid);
+
+ /* Have to select the engine */
+ req->sde = sdma_select_engine_vl(dd,
+ (u32)(uctxt->ctxt + fd->subctxt +
+ selector),
+ vl);
+ if (!req->sde || !sdma_running(req->sde)) {
+ ret = -ECOMM;
+ goto free_req;
+ }
+
+ /* We don't need an AHG entry if the request contains only one packet */
+ if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
+ int ahg = sdma_ahg_alloc(req->sde);
+
+ if (likely(ahg >= 0)) {
+ req->ahg_idx = (u8)ahg;
+ set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
+ }
+ }
+
+ set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
+ atomic_inc(&pq->n_reqs);
+ req_queued = 1;
+ /* Send the first N packets in the request to buy us some time */
+ ret = user_sdma_send_pkts(req, pcount);
+ if (unlikely(ret < 0 && ret != -EBUSY)) {
+ req->status = ret;
+ goto free_req;
+ }
+
+ /*
+ * It is possible that the SDMA engine would have processed all the
+ * submitted packets by the time we get here. Therefore, only set
+ * packet queue state to ACTIVE if there are still uncompleted
+ * requests.
+ */
+ if (atomic_read(&pq->n_reqs))
+ xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+
+ /*
+ * This is a somewhat blocking send implementation.
+ * The driver will block the caller until all packets of the
+ * request have been submitted to the SDMA engine. However, it
+ * will not wait for send completions.
+ */
+ while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+ ret = user_sdma_send_pkts(req, pcount);
+ if (ret < 0) {
+ if (ret != -EBUSY) {
+ req->status = ret;
+ set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+ if (ACCESS_ONCE(req->seqcomp) ==
+ req->seqsubmitted - 1)
+ goto free_req;
+ return ret;
+ }
+ wait_event_interruptible_timeout(
+ pq->busy.wait_dma,
+ (pq->state == SDMA_PKT_Q_ACTIVE),
+ msecs_to_jiffies(
+ SDMA_IOWAIT_TIMEOUT));
+ }
+ }
+ *count += idx;
+ return 0;
+free_req:
+ user_sdma_free_request(req, true);
+ if (req_queued)
+ pq_update(pq);
+ set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
+ return ret;
+}
+
+static inline u32 compute_data_length(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx)
+{
+ /*
+ * Determine the proper size of the packet data.
+ * The size of the data of the first packet is in the header
+ * template. However, it includes the header and ICRC, which need
+ * to be subtracted.
+ * The minimum representable packet data length in a header is 4 bytes,
+ * therefore, when the data length request is less than 4 bytes, there's
+ * only one packet, and the packet data length is equal to that of the
+ * request data length.
+ * The size of the remaining packets is the minimum of the frag
+ * size (MTU) or remaining data in the request.
+ */
+ u32 len;
+
+ if (!req->seqnum) {
+ if (req->data_len < sizeof(u32))
+ len = req->data_len;
+ else
+ len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+ (sizeof(tx->hdr) - 4));
+ } else if (req_opcode(req->info.ctrl) == EXPECTED) {
+ u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
+ PAGE_SIZE;
+ /*
+ * Get the data length based on the remaining space in the
+ * TID pair.
+ */
+ len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
+ /* If we've filled up the TID pair, move to the next one. */
+ if (unlikely(!len) && ++req->tididx < req->n_tids &&
+ req->tids[req->tididx]) {
+ tidlen = EXP_TID_GET(req->tids[req->tididx],
+ LEN) * PAGE_SIZE;
+ req->tidoffset = 0;
+ len = min_t(u32, tidlen, req->info.fragsize);
+ }
+ /*
+ * Since the TID pairs map entire pages, make sure that we
+ * are not going to try to send more data that we have
+ * remaining.
+ */
+ len = min(len, req->data_len - req->sent);
+ } else {
+ len = min(req->data_len - req->sent, (u32)req->info.fragsize);
+ }
+ SDMA_DBG(req, "Data Length = %u", len);
+ return len;
+}
+
+static inline u32 pad_len(u32 len)
+{
+ if (len & (sizeof(u32) - 1))
+ len += sizeof(u32) - (len & (sizeof(u32) - 1));
+ return len;
+}
+
+static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
+{
+ /* (Size of complete header - size of PBC) + 4B ICRC + data length */
+ return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
+}
+
+static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+{
+ int ret = 0;
+ unsigned npkts = 0;
+ struct user_sdma_txreq *tx = NULL;
+ struct hfi1_user_sdma_pkt_q *pq = NULL;
+ struct user_sdma_iovec *iovec = NULL;
+
+ if (!req->pq)
+ return -EINVAL;
+
+ pq = req->pq;
+
+ /* If tx completion has reported an error, we are done. */
+ if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+ set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+ return -EFAULT;
+ }
+
+ /*
+ * Check if we might have sent the entire request already
+ */
+ if (unlikely(req->seqnum == req->info.npkts)) {
+ if (!list_empty(&req->txps))
+ goto dosend;
+ return ret;
+ }
+
+ if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
+ maxpkts = req->info.npkts - req->seqnum;
+
+ while (npkts < maxpkts) {
+ u32 datalen = 0, queued = 0, data_sent = 0;
+ u64 iov_offset = 0;
+
+ /*
+ * Check whether any of the completions have come back
+ * with errors. If so, we are not going to process any
+ * more packets from this request.
+ */
+ if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+ set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+ return -EFAULT;
+ }
+
+ tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
+ if (!tx)
+ return -ENOMEM;
+
+ tx->flags = 0;
+ tx->req = req;
+ tx->busycount = 0;
+ INIT_LIST_HEAD(&tx->list);
+
+ if (req->seqnum == req->info.npkts - 1)
+ tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
+
+ /*
+ * Calculate the payload size - this is min of the fragment
+ * (MTU) size or the remaining bytes in the request but only
+ * if we have payload data.
+ */
+ if (req->data_len) {
+ iovec = &req->iovs[req->iov_idx];
+ if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+ if (++req->iov_idx == req->data_iovs) {
+ ret = -EFAULT;
+ goto free_txreq;
+ }
+ iovec = &req->iovs[req->iov_idx];
+ WARN_ON(iovec->offset);
+ }
+
+ datalen = compute_data_length(req, tx);
+ if (!datalen) {
+ SDMA_DBG(req,
+ "Request has data but pkt len is 0");
+ ret = -EFAULT;
+ goto free_tx;
+ }
+ }
+
+ if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
+ if (!req->seqnum) {
+ u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
+ u32 lrhlen = get_lrh_len(req->hdr,
+ pad_len(datalen));
+ /*
+ * Copy the request header into the tx header
+ * because the HW needs a cacheline-aligned
+ * address.
+ * This copy can be optimized out if the hdr
+ * member of user_sdma_request were also
+ * cacheline aligned.
+ */
+ memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
+ if (PBC2LRH(pbclen) != lrhlen) {
+ pbclen = (pbclen & 0xf000) |
+ LRH2PBC(lrhlen);
+ tx->hdr.pbc[0] = cpu_to_le16(pbclen);
+ }
+ ret = sdma_txinit_ahg(&tx->txreq,
+ SDMA_TXREQ_F_AHG_COPY,
+ sizeof(tx->hdr) + datalen,
+ req->ahg_idx, 0, NULL, 0,
+ user_sdma_txreq_cb);
+ if (ret)
+ goto free_tx;
+ ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
+ &tx->hdr,
+ sizeof(tx->hdr));
+ if (ret)
+ goto free_txreq;
+ } else {
+ int changes;
+
+ changes = set_txreq_header_ahg(req, tx,
+ datalen);
+ if (changes < 0)
+ goto free_tx;
+ sdma_txinit_ahg(&tx->txreq,
+ SDMA_TXREQ_F_USE_AHG,
+ datalen, req->ahg_idx, changes,
+ req->ahg, sizeof(req->hdr),
+ user_sdma_txreq_cb);
+ }
+ } else {
+ ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
+ datalen, user_sdma_txreq_cb);
+ if (ret)
+ goto free_tx;
+ /*
+ * Modify the header for this packet. This only needs
+ * to be done if we are not going to use AHG. Otherwise,
+ * the HW will do it based on the changes we gave it
+ * during sdma_txinit_ahg().
+ */
+ ret = set_txreq_header(req, tx, datalen);
+ if (ret)
+ goto free_txreq;
+ }
+
+ /*
+ * If the request contains any data vectors, add up to
+ * fragsize bytes to the descriptor.
+ */
+ while (queued < datalen &&
+ (req->sent + data_sent) < req->data_len) {
+ unsigned long base, offset;
+ unsigned pageidx, len;
+
+ base = (unsigned long)iovec->iov.iov_base;
+ offset = offset_in_page(base + iovec->offset +
+ iov_offset);
+ pageidx = (((iovec->offset + iov_offset +
+ base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
+ len = offset + req->info.fragsize > PAGE_SIZE ?
+ PAGE_SIZE - offset : req->info.fragsize;
+ len = min((datalen - queued), len);
+ ret = sdma_txadd_page(pq->dd, &tx->txreq,
+ iovec->pages[pageidx],
+ offset, len);
+ if (ret) {
+ SDMA_DBG(req, "SDMA txreq add page failed %d\n",
+ ret);
+ goto free_txreq;
+ }
+ iov_offset += len;
+ queued += len;
+ data_sent += len;
+ if (unlikely(queued < datalen &&
+ pageidx == iovec->npages &&
+ req->iov_idx < req->data_iovs - 1)) {
+ iovec->offset += iov_offset;
+ iovec = &req->iovs[++req->iov_idx];
+ iov_offset = 0;
+ }
+ }
+ /*
+ * The txreq was submitted successfully so we can update
+ * the counters.
+ */
+ req->koffset += datalen;
+ if (req_opcode(req->info.ctrl) == EXPECTED)
+ req->tidoffset += datalen;
+ req->sent += data_sent;
+ if (req->data_len)
+ iovec->offset += iov_offset;
+ list_add_tail(&tx->txreq.list, &req->txps);
+ /*
+ * It is important to increment this here as it is used to
+ * generate the BTH.PSN and, therefore, can't be bulk-updated
+ * outside of the loop.
+ */
+ tx->seqnum = req->seqnum++;
+ npkts++;
+ }
+dosend:
+ ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
+ if (list_empty(&req->txps)) {
+ req->seqsubmitted = req->seqnum;
+ if (req->seqnum == req->info.npkts) {
+ set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+ /*
+ * The txreq has already been submitted to the HW queue
+ * so we can free the AHG entry now. Corruption will not
+ * happen due to the sequential manner in which
+ * descriptors are processed.
+ */
+ if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
+ sdma_ahg_free(req->sde, req->ahg_idx);
+ }
+ } else if (ret > 0) {
+ req->seqsubmitted += ret;
+ ret = 0;
+ }
+ return ret;
+
+free_txreq:
+ sdma_txclean(pq->dd, &tx->txreq);
+free_tx:
+ kmem_cache_free(pq->txreq_cache, tx);
+ return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static inline int num_user_pages(const struct iovec *iov)
+{
+ const unsigned long addr = (unsigned long)iov->iov_base;
+ const unsigned long len = iov->iov_len;
+ const unsigned long spage = addr & PAGE_MASK;
+ const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+ return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
+{
+ struct evict_data evict_data;
+
+ evict_data.cleared = 0;
+ evict_data.target = npages;
+ hfi1_mmu_rb_evict(pq->handler, &evict_data);
+ return evict_data.cleared;
+}
+
+static int pin_vector_pages(struct user_sdma_request *req,
+ struct user_sdma_iovec *iovec)
+{
+ int ret = 0, pinned, npages, cleared;
+ struct page **pages;
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ struct sdma_mmu_node *node = NULL;
+ struct mmu_rb_node *rb_node;
+
+ rb_node = hfi1_mmu_rb_extract(pq->handler,
+ (unsigned long)iovec->iov.iov_base,
+ iovec->iov.iov_len);
+ if (rb_node && !IS_ERR(rb_node))
+ node = container_of(rb_node, struct sdma_mmu_node, rb);
+ else
+ rb_node = NULL;
+
+ if (!node) {
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ node->rb.addr = (unsigned long)iovec->iov.iov_base;
+ node->pq = pq;
+ atomic_set(&node->refcount, 0);
+ }
+
+ npages = num_user_pages(&iovec->iov);
+ if (node->npages < npages) {
+ pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ SDMA_DBG(req, "Failed page array alloc");
+ ret = -ENOMEM;
+ goto bail;
+ }
+ memcpy(pages, node->pages, node->npages * sizeof(*pages));
+
+ npages -= node->npages;
+
+retry:
+ if (!hfi1_can_pin_pages(pq->dd, pq->mm,
+ atomic_read(&pq->n_locked), npages)) {
+ cleared = sdma_cache_evict(pq, npages);
+ if (cleared >= npages)
+ goto retry;
+ }
+ pinned = hfi1_acquire_user_pages(pq->mm,
+ ((unsigned long)iovec->iov.iov_base +
+ (node->npages * PAGE_SIZE)), npages, 0,
+ pages + node->npages);
+ if (pinned < 0) {
+ kfree(pages);
+ ret = pinned;
+ goto bail;
+ }
+ if (pinned != npages) {
+ unpin_vector_pages(pq->mm, pages, node->npages,
+ pinned);
+ ret = -EFAULT;
+ goto bail;
+ }
+ kfree(node->pages);
+ node->rb.len = iovec->iov.iov_len;
+ node->pages = pages;
+ node->npages += pinned;
+ npages = node->npages;
+ atomic_add(pinned, &pq->n_locked);
+ }
+ iovec->pages = node->pages;
+ iovec->npages = npages;
+ iovec->node = node;
+
+ ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
+ if (ret) {
+ atomic_sub(node->npages, &pq->n_locked);
+ iovec->node = NULL;
+ goto bail;
+ }
+ return 0;
+bail:
+ if (rb_node)
+ unpin_vector_pages(pq->mm, node->pages, 0, node->npages);
+ kfree(node);
+ return ret;
+}
+
+static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
+ unsigned start, unsigned npages)
+{
+ hfi1_release_user_pages(mm, pages + start, npages, false);
+ kfree(pages);
+}
+
+static int check_header_template(struct user_sdma_request *req,
+ struct hfi1_pkt_header *hdr, u32 lrhlen,
+ u32 datalen)
+{
+ /*
+ * Perform safety checks for any type of packet:
+ * - transfer size is multiple of 64bytes
+ * - packet length is multiple of 4 bytes
+ * - packet length is not larger than MTU size
+ *
+ * These checks are only done for the first packet of the
+ * transfer since the header is "given" to us by user space.
+ * For the remainder of the packets we compute the values.
+ */
+ if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
+ lrhlen > get_lrh_len(*hdr, req->info.fragsize))
+ return -EINVAL;
+
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ /*
+ * The header is checked only on the first packet. Furthermore,
+ * we ensure that at least one TID entry is copied when the
+ * request is submitted. Therefore, we don't have to verify that
+ * tididx points to something sane.
+ */
+ u32 tidval = req->tids[req->tididx],
+ tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
+ tididx = EXP_TID_GET(tidval, IDX),
+ tidctrl = EXP_TID_GET(tidval, CTRL),
+ tidoff;
+ __le32 kval = hdr->kdeth.ver_tid_offset;
+
+ tidoff = KDETH_GET(kval, OFFSET) *
+ (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+ KDETH_OM_LARGE : KDETH_OM_SMALL);
+ /*
+ * Expected receive packets have the following
+ * additional checks:
+ * - offset is not larger than the TID size
+ * - TIDCtrl values match between header and TID array
+ * - TID indexes match between header and TID array
+ */
+ if ((tidoff + datalen > tidlen) ||
+ KDETH_GET(kval, TIDCTRL) != tidctrl ||
+ KDETH_GET(kval, TID) != tididx)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Correctly set the BTH.PSN field based on type of
+ * transfer - eager packets can just increment the PSN but
+ * expected packets encode generation and sequence in the
+ * BTH.PSN field so just incrementing will result in errors.
+ */
+static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
+{
+ u32 val = be32_to_cpu(bthpsn),
+ mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
+ 0xffffffull),
+ psn = val & mask;
+ if (expct)
+ psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+ else
+ psn = psn + frags;
+ return psn & mask;
+}
+
+static int set_txreq_header(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx, u32 datalen)
+{
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ struct hfi1_pkt_header *hdr = &tx->hdr;
+ u16 pbclen;
+ int ret;
+ u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
+
+ /* Copy the header template to the request before modification */
+ memcpy(hdr, &req->hdr, sizeof(*hdr));
+
+ /*
+ * Check if the PBC and LRH length are mismatched. If so
+ * adjust both in the header.
+ */
+ pbclen = le16_to_cpu(hdr->pbc[0]);
+ if (PBC2LRH(pbclen) != lrhlen) {
+ pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
+ hdr->pbc[0] = cpu_to_le16(pbclen);
+ hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
+ /*
+ * Third packet
+ * This is the first packet in the sequence that has
+ * a "static" size that can be used for the rest of
+ * the packets (besides the last one).
+ */
+ if (unlikely(req->seqnum == 2)) {
+ /*
+ * From this point on the lengths in both the
+ * PBC and LRH are the same until the last
+ * packet.
+ * Adjust the template so we don't have to update
+ * every packet
+ */
+ req->hdr.pbc[0] = hdr->pbc[0];
+ req->hdr.lrh[2] = hdr->lrh[2];
+ }
+ }
+ /*
+ * We only have to modify the header if this is not the
+ * first packet in the request. Otherwise, we use the
+ * header given to us.
+ */
+ if (unlikely(!req->seqnum)) {
+ ret = check_header_template(req, hdr, lrhlen, datalen);
+ if (ret)
+ return ret;
+ goto done;
+ }
+
+ hdr->bth[2] = cpu_to_be32(
+ set_pkt_bth_psn(hdr->bth[2],
+ (req_opcode(req->info.ctrl) == EXPECTED),
+ req->seqnum));
+
+ /* Set ACK request on last packet */
+ if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+ hdr->bth[2] |= cpu_to_be32(1UL << 31);
+
+ /* Set the new offset */
+ hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
+ /* Expected packets have to fill in the new TID information */
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ tidval = req->tids[req->tididx];
+ /*
+ * If the offset puts us at the end of the current TID,
+ * advance everything.
+ */
+ if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+ PAGE_SIZE)) {
+ req->tidoffset = 0;
+ /*
+ * Since we don't copy all the TIDs, all at once,
+ * we have to check again.
+ */
+ if (++req->tididx > req->n_tids - 1 ||
+ !req->tids[req->tididx]) {
+ return -EINVAL;
+ }
+ tidval = req->tids[req->tididx];
+ }
+ req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
+ KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
+ /* Set KDETH.TIDCtrl based on value for this TID. */
+ KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
+ EXP_TID_GET(tidval, CTRL));
+ /* Set KDETH.TID based on value for this TID */
+ KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
+ EXP_TID_GET(tidval, IDX));
+ /* Clear KDETH.SH only on the last packet */
+ if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+ KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
+ /*
+ * Set the KDETH.OFFSET and KDETH.OM based on size of
+ * transfer.
+ */
+ SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
+ req->tidoffset, req->tidoffset / req->omfactor,
+ req->omfactor != KDETH_OM_SMALL);
+ KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
+ req->tidoffset / req->omfactor);
+ KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
+ req->omfactor != KDETH_OM_SMALL);
+ }
+done:
+ trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
+ req->info.comp_idx, hdr, tidval);
+ return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
+}
+
+static int set_txreq_header_ahg(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx, u32 len)
+{
+ int diff = 0;
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ struct hfi1_pkt_header *hdr = &req->hdr;
+ u16 pbclen = le16_to_cpu(hdr->pbc[0]);
+ u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len));
+
+ if (PBC2LRH(pbclen) != lrhlen) {
+ /* PBC.PbcLengthDWs */
+ AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+ cpu_to_le16(LRH2PBC(lrhlen)));
+ /* LRH.PktLen (we need the full 16 bits due to byte swap) */
+ AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+ cpu_to_be16(lrhlen >> 2));
+ }
+
+ /*
+ * Do the common updates
+ */
+ /* BTH.PSN and BTH.A */
+ val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
+ (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
+ if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
+ val32 |= 1UL << 31;
+ AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+ AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+ /* KDETH.Offset */
+ AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+ cpu_to_le16(req->koffset & 0xffff));
+ AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
+ cpu_to_le16(req->koffset >> 16));
+ if (req_opcode(req->info.ctrl) == EXPECTED) {
+ __le16 val;
+
+ tidval = req->tids[req->tididx];
+
+ /*
+ * If the offset puts us at the end of the current TID,
+ * advance everything.
+ */
+ if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+ PAGE_SIZE)) {
+ req->tidoffset = 0;
+ /*
+ * Since we don't copy all the TIDs, all at once,
+ * we have to check again.
+ */
+ if (++req->tididx > req->n_tids - 1 ||
+ !req->tids[req->tididx]) {
+ return -EINVAL;
+ }
+ tidval = req->tids[req->tididx];
+ }
+ req->omfactor = ((EXP_TID_GET(tidval, LEN) *
+ PAGE_SIZE) >=
+ KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
+ KDETH_OM_SMALL;
+ /* KDETH.OM and KDETH.OFFSET (TID) */
+ AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+ ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
+ ((req->tidoffset / req->omfactor) & 0x7fff)));
+ /* KDETH.TIDCtrl, KDETH.TID */
+ val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
+ (EXP_TID_GET(tidval, IDX) & 0x3ff));
+ /* Clear KDETH.SH on last packet */
+ if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
+ val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
+ INTR) <<
+ AHG_KDETH_INTR_SHIFT);
+ val &= cpu_to_le16(~(1U << 13));
+ AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+ } else {
+ AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+ }
+ }
+
+ trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
+ req->info.comp_idx, req->sde->this_idx,
+ req->ahg_idx, req->ahg, diff, tidval);
+ return diff;
+}
+
+/*
+ * SDMA tx request completion callback. Called when the SDMA progress
+ * state machine gets notification that the SDMA descriptors for this
+ * tx request have been processed by the DMA engine. Called in
+ * interrupt context.
+ */
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
+{
+ struct user_sdma_txreq *tx =
+ container_of(txreq, struct user_sdma_txreq, txreq);
+ struct user_sdma_request *req;
+ struct hfi1_user_sdma_pkt_q *pq;
+ struct hfi1_user_sdma_comp_q *cq;
+ u16 idx;
+
+ if (!tx->req)
+ return;
+
+ req = tx->req;
+ pq = req->pq;
+ cq = req->cq;
+
+ if (status != SDMA_TXREQ_S_OK) {
+ SDMA_DBG(req, "SDMA completion with error %d",
+ status);
+ set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+ }
+
+ req->seqcomp = tx->seqnum;
+ kmem_cache_free(pq->txreq_cache, tx);
+ tx = NULL;
+
+ idx = req->info.comp_idx;
+ if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
+ if (req->seqcomp == req->info.npkts - 1) {
+ req->status = 0;
+ user_sdma_free_request(req, false);
+ pq_update(pq);
+ set_comp_state(pq, cq, idx, COMPLETE, 0);
+ }
+ } else {
+ if (status != SDMA_TXREQ_S_OK)
+ req->status = status;
+ if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
+ (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
+ test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
+ user_sdma_free_request(req, false);
+ pq_update(pq);
+ set_comp_state(pq, cq, idx, ERROR, req->status);
+ }
+ }
+}
+
+static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
+{
+ if (atomic_dec_and_test(&pq->n_reqs)) {
+ xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+ wake_up(&pq->wait);
+ }
+}
+
+static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
+{
+ if (!list_empty(&req->txps)) {
+ struct sdma_txreq *t, *p;
+
+ list_for_each_entry_safe(t, p, &req->txps, list) {
+ struct user_sdma_txreq *tx =
+ container_of(t, struct user_sdma_txreq, txreq);
+ list_del_init(&t->list);
+ sdma_txclean(req->pq->dd, t);
+ kmem_cache_free(req->pq->txreq_cache, tx);
+ }
+ }
+ if (req->data_iovs) {
+ struct sdma_mmu_node *node;
+ int i;
+
+ for (i = 0; i < req->data_iovs; i++) {
+ node = req->iovs[i].node;
+ if (!node)
+ continue;
+
+ if (unpin)
+ hfi1_mmu_rb_remove(req->pq->handler,
+ &node->rb);
+ else
+ atomic_dec(&node->refcount);
+ }
+ }
+ kfree(req->tids);
+ clear_bit(req->info.comp_idx, req->pq->req_in_use);
+}
+
+static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
+ struct hfi1_user_sdma_comp_q *cq,
+ u16 idx, enum hfi1_sdma_comp_state state,
+ int ret)
+{
+ hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
+ pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
+ cq->comps[idx].status = state;
+ if (state == ERROR)
+ cq->comps[idx].errcode = -ret;
+ trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
+ idx, state, ret);
+}
+
+static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
+ unsigned long len)
+{
+ return (bool)(node->addr == addr);
+}
+
+static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
+{
+ struct sdma_mmu_node *node =
+ container_of(mnode, struct sdma_mmu_node, rb);
+
+ atomic_inc(&node->refcount);
+ return 0;
+}
+
+/*
+ * Return 1 to remove the node from the rb tree and call the remove op.
+ *
+ * Called with the rb tree lock held.
+ */
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+ void *evict_arg, bool *stop)
+{
+ struct sdma_mmu_node *node =
+ container_of(mnode, struct sdma_mmu_node, rb);
+ struct evict_data *evict_data = evict_arg;
+
+ /* is this node still being used? */
+ if (atomic_read(&node->refcount))
+ return 0; /* keep this node */
+
+ /* this node will be evicted, add its pages to our count */
+ evict_data->cleared += node->npages;
+
+ /* have enough pages been cleared? */
+ if (evict_data->cleared >= evict_data->target)
+ *stop = true;
+
+ return 1; /* remove this node */
+}
+
+static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
+{
+ struct sdma_mmu_node *node =
+ container_of(mnode, struct sdma_mmu_node, rb);
+
+ atomic_sub(node->npages, &node->pq->n_locked);
+
+ unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
+
+ kfree(node);
+}
+
+static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
+{
+ struct sdma_mmu_node *node =
+ container_of(mnode, struct sdma_mmu_node, rb);
+
+ if (!atomic_read(&node->refcount))
+ return 1;
+ return 0;
+}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
new file mode 100644
index 000000000000..39001714f551
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/device.h>
+#include <linux/wait.h>
+
+#include "common.h"
+#include "iowait.h"
+#include "user_exp_rcv.h"
+
+extern uint extended_psn;
+
+struct hfi1_user_sdma_pkt_q {
+ struct list_head list;
+ unsigned ctxt;
+ unsigned subctxt;
+ u16 n_max_reqs;
+ atomic_t n_reqs;
+ u16 reqidx;
+ struct hfi1_devdata *dd;
+ struct kmem_cache *txreq_cache;
+ struct user_sdma_request *reqs;
+ unsigned long *req_in_use;
+ struct iowait busy;
+ unsigned state;
+ wait_queue_head_t wait;
+ unsigned long unpinned;
+ struct mmu_rb_handler *handler;
+ atomic_t n_locked;
+ struct mm_struct *mm;
+};
+
+struct hfi1_user_sdma_comp_q {
+ u16 nentries;
+ struct hfi1_sdma_comp_entry *comps;
+};
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
+int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
+ unsigned long *);
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
new file mode 100644
index 000000000000..2b359540901d
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -0,0 +1,1789 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "trace.h"
+#include "qp.h"
+#include "verbs_txreq.h"
+
+static unsigned int hfi1_lkey_table_size = 16;
+module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
+ S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+ "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int hfi1_max_pds = 0xFFFF;
+module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+ "Maximum number of protection domains to support");
+
+static unsigned int hfi1_max_ahs = 0xFFFF;
+module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int hfi1_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+ "Maximum number of completion queue entries to support");
+
+unsigned int hfi1_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int hfi1_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int hfi1_max_qps = 16384;
+module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int hfi1_max_sges = 0x60;
+module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int hfi1_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+ "Maximum number of multicast groups to support");
+
+unsigned int hfi1_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
+ uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+ "Maximum number of attached QPs to support");
+
+unsigned int hfi1_max_srqs = 1024;
+module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int hfi1_max_srq_sges = 128;
+module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int hfi1_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+unsigned short piothreshold = 256;
+module_param(piothreshold, ushort, S_IRUGO);
+MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
+
+#define COPY_CACHELESS 1
+#define COPY_ADAPTIVE 2
+static unsigned int sge_copy_mode;
+module_param(sge_copy_mode, uint, S_IRUGO);
+MODULE_PARM_DESC(sge_copy_mode,
+ "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
+
+static void verbs_sdma_complete(
+ struct sdma_txreq *cookie,
+ int status);
+
+static int pio_wait(struct rvt_qp *qp,
+ struct send_context *sc,
+ struct hfi1_pkt_state *ps,
+ u32 flag);
+
+/* Length of buffer to create verbs txreq cache name */
+#define TXREQ_NAME_LEN 24
+
+static uint wss_threshold;
+module_param(wss_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
+static uint wss_clean_period = 256;
+module_param(wss_clean_period, uint, S_IRUGO);
+MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
+
+/* memory working set size */
+struct hfi1_wss {
+ unsigned long *entries;
+ atomic_t total_count;
+ atomic_t clean_counter;
+ atomic_t clean_entry;
+
+ int threshold;
+ int num_entries;
+ long pages_mask;
+};
+
+static struct hfi1_wss wss;
+
+int hfi1_wss_init(void)
+{
+ long llc_size;
+ long llc_bits;
+ long table_size;
+ long table_bits;
+
+ /* check for a valid percent range - default to 80 if none or invalid */
+ if (wss_threshold < 1 || wss_threshold > 100)
+ wss_threshold = 80;
+ /* reject a wildly large period */
+ if (wss_clean_period > 1000000)
+ wss_clean_period = 256;
+ /* reject a zero period */
+ if (wss_clean_period == 0)
+ wss_clean_period = 1;
+
+ /*
+ * Calculate the table size - the next power of 2 larger than the
+ * LLC size. LLC size is in KiB.
+ */
+ llc_size = wss_llc_size() * 1024;
+ table_size = roundup_pow_of_two(llc_size);
+
+ /* one bit per page in rounded up table */
+ llc_bits = llc_size / PAGE_SIZE;
+ table_bits = table_size / PAGE_SIZE;
+ wss.pages_mask = table_bits - 1;
+ wss.num_entries = table_bits / BITS_PER_LONG;
+
+ wss.threshold = (llc_bits * wss_threshold) / 100;
+ if (wss.threshold == 0)
+ wss.threshold = 1;
+
+ atomic_set(&wss.clean_counter, wss_clean_period);
+
+ wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
+ GFP_KERNEL);
+ if (!wss.entries) {
+ hfi1_wss_exit();
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void hfi1_wss_exit(void)
+{
+ /* coded to handle partially initialized and repeat callers */
+ kfree(wss.entries);
+ wss.entries = NULL;
+}
+
+/*
+ * Advance the clean counter. When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking. Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information. Since this is only a heuristic, this is
+ * OK. Any innaccuracies will clean themselves out as the counter
+ * advances. That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero. When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(void)
+{
+ int entry;
+ int weight;
+ unsigned long bits;
+
+ /* become the cleaner if we decrement the counter to zero */
+ if (atomic_dec_and_test(&wss.clean_counter)) {
+ /*
+ * Set, not add, the clean period. This avoids an issue
+ * where the counter could decrement below the clean period.
+ * Doing a set can result in lost decrements, slowing the
+ * clean advance. Since this a heuristic, this possible
+ * slowdown is OK.
+ *
+ * An alternative is to loop, advancing the counter by a
+ * clean period until the result is > 0. However, this could
+ * lead to several threads keeping another in the clean loop.
+ * This could be mitigated by limiting the number of times
+ * we stay in the loop.
+ */
+ atomic_set(&wss.clean_counter, wss_clean_period);
+
+ /*
+ * Uniquely grab the entry to clean and move to next.
+ * The current entry is always the lower bits of
+ * wss.clean_entry. The table size, wss.num_entries,
+ * is always a power-of-2.
+ */
+ entry = (atomic_inc_return(&wss.clean_entry) - 1)
+ & (wss.num_entries - 1);
+
+ /* clear the entry and count the bits */
+ bits = xchg(&wss.entries[entry], 0);
+ weight = hweight64((u64)bits);
+ /* only adjust the contended total count if needed */
+ if (weight)
+ atomic_sub(weight, &wss.total_count);
+ }
+}
+
+/*
+ * Insert the given address into the working set array.
+ */
+static void wss_insert(void *address)
+{
+ u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
+ u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
+ u32 nr = page & (BITS_PER_LONG - 1);
+
+ if (!test_and_set_bit(nr, &wss.entries[entry]))
+ atomic_inc(&wss.total_count);
+
+ wss_advance_clean_counter();
+}
+
+/*
+ * Is the working set larger than the threshold?
+ */
+static inline int wss_exceeds_threshold(void)
+{
+ return atomic_read(&wss.total_count) >= wss.threshold;
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
+ [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+ [IB_WR_SEND] = IB_WC_SEND,
+ [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+ [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+ [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
+ [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
+ [IB_WR_REG_MR] = IB_WC_REG_MR
+};
+
+/*
+ * Length of header by opcode, 0 --> not supported
+ */
+const u8 hdr_len_by_opcode[256] = {
+ /* RC */
+ [IB_OPCODE_RC_SEND_FIRST] = 12 + 8,
+ [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8,
+ [IB_OPCODE_RC_SEND_LAST] = 12 + 8,
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_SEND_ONLY] = 12 + 8,
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16,
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4,
+ [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28,
+ [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
+ [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4,
+ [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
+ /* UC */
+ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
+ [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
+ [IB_OPCODE_UC_SEND_LAST] = 12 + 8,
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_UC_SEND_ONLY] = 12 + 8,
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16,
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+ /* UD */
+ [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8,
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12
+};
+
+static const opcode_handler opcode_handler_tbl[256] = {
+ /* RC */
+ [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv,
+ [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
+ /* UC */
+ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv,
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+ /* UD */
+ [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv,
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv,
+ /* CNP */
+ [IB_OPCODE_CNP] = &hfi1_cnp_rcv
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_hfi1_sys_image_guid;
+
+/**
+ * hfi1_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ * @copy_last: do a separate copy of the last 8 bytes
+ */
+void hfi1_copy_sge(
+ struct rvt_sge_state *ss,
+ void *data, u32 length,
+ int release,
+ int copy_last)
+{
+ struct rvt_sge *sge = &ss->sge;
+ int in_last = 0;
+ int i;
+ int cacheless_copy = 0;
+
+ if (sge_copy_mode == COPY_CACHELESS) {
+ cacheless_copy = length >= PAGE_SIZE;
+ } else if (sge_copy_mode == COPY_ADAPTIVE) {
+ if (length >= PAGE_SIZE) {
+ /*
+ * NOTE: this *assumes*:
+ * o The first vaddr is the dest.
+ * o If multiple pages, then vaddr is sequential.
+ */
+ wss_insert(sge->vaddr);
+ if (length >= (2 * PAGE_SIZE))
+ wss_insert(sge->vaddr + PAGE_SIZE);
+
+ cacheless_copy = wss_exceeds_threshold();
+ } else {
+ wss_advance_clean_counter();
+ }
+ }
+ if (copy_last) {
+ if (length > 8) {
+ length -= 8;
+ } else {
+ copy_last = 0;
+ in_last = 1;
+ }
+ }
+
+again:
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ if (unlikely(in_last)) {
+ /* enforce byte transfer ordering */
+ for (i = 0; i < len; i++)
+ ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+ } else if (cacheless_copy) {
+ cacheless_memcpy(sge->vaddr, data, len);
+ } else {
+ memcpy(sge->vaddr, data, len);
+ }
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (release)
+ rvt_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+
+ if (copy_last) {
+ copy_last = 0;
+ in_last = 1;
+ length = 8;
+ goto again;
+ }
+}
+
+/**
+ * hfi1_skip_sge - skip over SGE memory
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
+{
+ struct rvt_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ WARN_ON_ONCE(len == 0);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (release)
+ rvt_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+}
+
+/*
+ * Make sure the QP is ready and able to accept the given opcode.
+ */
+static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet)
+{
+ if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+ return NULL;
+ if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+ (opcode == IB_OPCODE_CNP))
+ return opcode_handler_tbl[opcode];
+
+ return NULL;
+}
+
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ *
+ * Tlen is the length of the header + data + CRC in bytes.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct hfi1_ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler packet_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+ u16 lid;
+
+ /* Check for GRH */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == HFI1_LRH_BTH) {
+ packet->ohdr = &hdr->u.oth;
+ } else if (lnh == HFI1_LRH_GRH) {
+ u32 vtf;
+
+ packet->ohdr = &hdr->u.l.oth;
+ if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+ goto drop;
+ vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+ if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+ goto drop;
+ packet->rcv_flags |= HFI1_HAS_GRH;
+ } else {
+ goto drop;
+ }
+
+ trace_input_ibhdr(rcd->dd, hdr);
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK;
+ lid = be16_to_cpu(hdr->lrh[1]);
+ if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+ (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) {
+ struct rvt_mcast *mcast;
+ struct rvt_mcast_qp *p;
+
+ if (lnh != HFI1_LRH_GRH)
+ goto drop;
+ mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid);
+ if (!mcast)
+ goto drop;
+ list_for_each_entry_rcu(p, &mcast->qp_list, list) {
+ packet->qp = p->qp;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ packet_handler = qp_ok(opcode, packet);
+ if (likely(packet_handler))
+ packet_handler(packet);
+ else
+ ibp->rvp.n_pkt_drops++;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ }
+ /*
+ * Notify rvt_multicast_detach() if it is waiting for us
+ * to finish.
+ */
+ if (atomic_dec_return(&mcast->refcount) <= 1)
+ wake_up(&mcast->wait);
+ } else {
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp) {
+ rcu_read_unlock();
+ goto drop;
+ }
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ packet_handler = qp_ok(opcode, packet);
+ if (likely(packet_handler))
+ packet_handler(packet);
+ else
+ ibp->rvp.n_pkt_drops++;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+ }
+ return;
+
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+ struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
+ struct list_head *list = &dev->memwait;
+ struct rvt_qp *qp = NULL;
+ struct iowait *wait;
+ unsigned long flags;
+ struct hfi1_qp_priv *priv;
+
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ if (!list_empty(list)) {
+ wait = list_first_entry(list, struct iowait, list);
+ qp = iowait_to_qp(wait);
+ priv = qp->priv;
+ list_del_init(&priv->s_iowait.list);
+ /* refcount held until actual wake up */
+ if (!list_empty(list))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ }
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+ if (qp)
+ hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
+}
+
+void update_sge(struct rvt_sge_state *ss, u32 length)
+{
+ struct rvt_sge *sge = &ss->sge;
+
+ sge->vaddr += length;
+ sge->length -= length;
+ sge->sge_length -= length;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ return;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+}
+
+/*
+ * This is called with progress side lock held.
+ */
+/* New API */
+static void verbs_sdma_complete(
+ struct sdma_txreq *cookie,
+ int status)
+{
+ struct verbs_txreq *tx =
+ container_of(cookie, struct verbs_txreq, txreq);
+ struct rvt_qp *qp = tx->qp;
+
+ spin_lock(&qp->s_lock);
+ if (tx->wqe) {
+ hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+ } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ struct hfi1_ib_header *hdr;
+
+ hdr = &tx->phdr.hdr;
+ hfi1_rc_send_complete(qp, hdr);
+ }
+ spin_unlock(&qp->s_lock);
+
+ hfi1_put_txreq(tx);
+}
+
+static int wait_kmem(struct hfi1_ibdev *dev,
+ struct rvt_qp *qp,
+ struct hfi1_pkt_state *ps)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+ write_seqlock(&dev->iowait_lock);
+ list_add_tail(&ps->s_txreq->txreq.list,
+ &priv->s_iowait.tx_head);
+ if (list_empty(&priv->s_iowait.list)) {
+ if (list_empty(&dev->memwait))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ qp->s_flags |= RVT_S_WAIT_KMEM;
+ list_add_tail(&priv->s_iowait.list, &dev->memwait);
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+ atomic_inc(&qp->refcount);
+ }
+ write_sequnlock(&dev->iowait_lock);
+ qp->s_flags &= ~RVT_S_BUSY;
+ ret = -EBUSY;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ return ret;
+}
+
+/*
+ * This routine calls txadds for each sg entry.
+ *
+ * Add failures will revert the sge cursor
+ */
+static noinline int build_verbs_ulp_payload(
+ struct sdma_engine *sde,
+ struct rvt_sge_state *ss,
+ u32 length,
+ struct verbs_txreq *tx)
+{
+ struct rvt_sge *sg_list = ss->sg_list;
+ struct rvt_sge sge = ss->sge;
+ u8 num_sge = ss->num_sge;
+ u32 len;
+ int ret = 0;
+
+ while (length) {
+ len = ss->sge.length;
+ if (len > length)
+ len = length;
+ if (len > ss->sge.sge_length)
+ len = ss->sge.sge_length;
+ WARN_ON_ONCE(len == 0);
+ ret = sdma_txadd_kvaddr(
+ sde->dd,
+ &tx->txreq,
+ ss->sge.vaddr,
+ len);
+ if (ret)
+ goto bail_txadd;
+ update_sge(ss, len);
+ length -= len;
+ }
+ return ret;
+bail_txadd:
+ /* unwind cursor */
+ ss->sge = sge;
+ ss->num_sge = num_sge;
+ ss->sg_list = sg_list;
+ return ret;
+}
+
+/*
+ * Build the number of DMA descriptors needed to send length bytes of data.
+ *
+ * NOTE: DMA mapping is held in the tx until completed in the ring or
+ * the tx desc is freed without having been submitted to the ring
+ *
+ * This routine ensures all the helper routine calls succeed.
+ */
+/* New API */
+static int build_verbs_tx_desc(
+ struct sdma_engine *sde,
+ struct rvt_sge_state *ss,
+ u32 length,
+ struct verbs_txreq *tx,
+ struct hfi1_ahg_info *ahg_info,
+ u64 pbc)
+{
+ int ret = 0;
+ struct hfi1_sdma_header *phdr = &tx->phdr;
+ u16 hdrbytes = tx->hdr_dwords << 2;
+
+ if (!ahg_info->ahgcount) {
+ ret = sdma_txinit_ahg(
+ &tx->txreq,
+ ahg_info->tx_flags,
+ hdrbytes + length,
+ ahg_info->ahgidx,
+ 0,
+ NULL,
+ 0,
+ verbs_sdma_complete);
+ if (ret)
+ goto bail_txadd;
+ phdr->pbc = cpu_to_le64(pbc);
+ ret = sdma_txadd_kvaddr(
+ sde->dd,
+ &tx->txreq,
+ phdr,
+ hdrbytes);
+ if (ret)
+ goto bail_txadd;
+ } else {
+ ret = sdma_txinit_ahg(
+ &tx->txreq,
+ ahg_info->tx_flags,
+ length,
+ ahg_info->ahgidx,
+ ahg_info->ahgcount,
+ ahg_info->ahgdesc,
+ hdrbytes,
+ verbs_sdma_complete);
+ if (ret)
+ goto bail_txadd;
+ }
+
+ /* add the ulp payload - if any. ss can be NULL for acks */
+ if (ss)
+ ret = build_verbs_ulp_payload(sde, ss, length, tx);
+bail_txadd:
+ return ret;
+}
+
+int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ u64 pbc)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ahg_info *ahg_info = priv->s_ahg;
+ u32 hdrwords = qp->s_hdrwords;
+ struct rvt_sge_state *ss = qp->s_cur_sge;
+ u32 len = qp->s_cur_size;
+ u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */
+ struct hfi1_ibdev *dev = ps->dev;
+ struct hfi1_pportdata *ppd = ps->ppd;
+ struct verbs_txreq *tx;
+ u64 pbc_flags = 0;
+ u8 sc5 = priv->s_sc;
+
+ int ret;
+
+ tx = ps->s_txreq;
+ if (!sdma_txreq_built(&tx->txreq)) {
+ if (likely(pbc == 0)) {
+ u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+ /* No vl15 here */
+ /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+
+ pbc = create_pbc(ppd,
+ pbc_flags,
+ qp->srate_mbps,
+ vl,
+ plen);
+ }
+ tx->wqe = qp->s_wqe;
+ ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahg_info, pbc);
+ if (unlikely(ret))
+ goto bail_build;
+ }
+ ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq);
+ if (unlikely(ret < 0)) {
+ if (ret == -ECOMM)
+ goto bail_ecomm;
+ return ret;
+ }
+ trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+ &ps->s_txreq->phdr.hdr);
+ return ret;
+
+bail_ecomm:
+ /* The current one got "sent" */
+ return 0;
+bail_build:
+ ret = wait_kmem(dev, qp, ps);
+ if (!ret) {
+ /* free txreq - bad state */
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ }
+ return ret;
+}
+
+/*
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int pio_wait(struct rvt_qp *qp,
+ struct send_context *sc,
+ struct hfi1_pkt_state *ps,
+ u32 flag)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_devdata *dd = sc->dd;
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * Note that as soon as want_buffer() is called and
+ * possibly before it returns, sc_piobufavail()
+ * could be called. Therefore, put QP on the I/O wait list before
+ * enabling the PIO avail interrupt.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+ write_seqlock(&dev->iowait_lock);
+ list_add_tail(&ps->s_txreq->txreq.list,
+ &priv->s_iowait.tx_head);
+ if (list_empty(&priv->s_iowait.list)) {
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ int was_empty;
+
+ dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
+ dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN);
+ qp->s_flags |= flag;
+ was_empty = list_empty(&sc->piowait);
+ list_add_tail(&priv->s_iowait.list, &sc->piowait);
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
+ atomic_inc(&qp->refcount);
+ /* counting: only call wantpiobuf_intr if first user */
+ if (was_empty)
+ hfi1_sc_wantpiobuf_intr(sc, 1);
+ }
+ write_sequnlock(&dev->iowait_lock);
+ qp->s_flags &= ~RVT_S_BUSY;
+ ret = -EBUSY;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+static void verbs_pio_complete(void *arg, int code)
+{
+ struct rvt_qp *qp = (struct rvt_qp *)arg;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_pio_dec(&priv->s_iowait))
+ iowait_drain_wakeup(&priv->s_iowait);
+}
+
+int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ u64 pbc)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 hdrwords = qp->s_hdrwords;
+ struct rvt_sge_state *ss = qp->s_cur_sge;
+ u32 len = qp->s_cur_size;
+ u32 dwords = (len + 3) >> 2;
+ u32 plen = hdrwords + dwords + 2; /* includes pbc */
+ struct hfi1_pportdata *ppd = ps->ppd;
+ u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr;
+ u64 pbc_flags = 0;
+ u8 sc5;
+ unsigned long flags = 0;
+ struct send_context *sc;
+ struct pio_buf *pbuf;
+ int wc_status = IB_WC_SUCCESS;
+ int ret = 0;
+ pio_release_cb cb = NULL;
+
+ /* only RC/UC use complete */
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ cb = verbs_pio_complete;
+ break;
+ default:
+ break;
+ }
+
+ /* vl15 special case taken care of in ud.c */
+ sc5 = priv->s_sc;
+ sc = ps->s_txreq->psc;
+
+ if (likely(pbc == 0)) {
+ u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+ /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+ pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+ pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+ }
+ if (cb)
+ iowait_pio_inc(&priv->s_iowait);
+ pbuf = sc_buffer_alloc(sc, plen, cb, qp);
+ if (unlikely(!pbuf)) {
+ if (cb)
+ verbs_pio_complete(qp, 0);
+ if (ppd->host_link_state != HLS_UP_ACTIVE) {
+ /*
+ * If we have filled the PIO buffers to capacity and are
+ * not in an active state this request is not going to
+ * go out to so just complete it with an error or else a
+ * ULP or the core may be stuck waiting.
+ */
+ hfi1_cdbg(
+ PIO,
+ "alloc failed. state not active, completing");
+ wc_status = IB_WC_GENERAL_ERR;
+ goto pio_bail;
+ } else {
+ /*
+ * This is a normal occurrence. The PIO buffs are full
+ * up but we are still happily sending, well we could be
+ * so lets continue to queue the request.
+ */
+ hfi1_cdbg(PIO, "alloc failed. state active, queuing");
+ ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
+ if (!ret)
+ /* txreq not queued - free */
+ goto bail;
+ /* tx consumed in wait */
+ return ret;
+ }
+ }
+
+ if (len == 0) {
+ pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
+ } else {
+ if (ss) {
+ seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4);
+ while (len) {
+ void *addr = ss->sge.vaddr;
+ u32 slen = ss->sge.length;
+
+ if (slen > len)
+ slen = len;
+ update_sge(ss, slen);
+ seg_pio_copy_mid(pbuf, addr, slen);
+ len -= slen;
+ }
+ seg_pio_copy_end(pbuf);
+ }
+ }
+
+ trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
+ &ps->s_txreq->phdr.hdr);
+
+pio_bail:
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_send_complete(qp, qp->s_wqe, wc_status);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+
+ ret = 0;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+ return ret;
+}
+
+/*
+ * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the partition key table), return 0
+ * otherwise. Use the matching criteria for egress partition keys
+ * specified in the OPAv1 spec., section 9.1l.7.
+ */
+static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+ u16 mkey = pkey & PKEY_LOW_15_MASK;
+ u16 mentry = ent & PKEY_LOW_15_MASK;
+
+ if (mkey == mentry) {
+ /*
+ * If pkey[15] is set (full partition member),
+ * is bit 15 in the corresponding table element
+ * clear (limited member)?
+ */
+ if (pkey & PKEY_MEMBER_MASK)
+ return !!(ent & PKEY_MEMBER_MASK);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd: Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5: SC for packet
+ * @s_pkey_index: It will be used for look up optimization for kernel contexts
+ * only. If it is negative value, then it means user contexts is calling this
+ * function.
+ *
+ * It checks if hdr's pkey is valid.
+ *
+ * Return: 0 on success, otherwise, 1
+ */
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+ u8 sc5, int8_t s_pkey_index)
+{
+ struct hfi1_devdata *dd;
+ int i;
+ u16 pkey;
+ int is_user_ctxt_mechanism = (s_pkey_index < 0);
+
+ if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
+ return 0;
+
+ pkey = (u16)be32_to_cpu(bth[0]);
+
+ /* If SC15, pkey[0:14] must be 0x7fff */
+ if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+ goto bad;
+
+ /* Is the pkey = 0x0, or 0x8000? */
+ if ((pkey & PKEY_LOW_15_MASK) == 0)
+ goto bad;
+
+ /*
+ * For the kernel contexts only, if a qp is passed into the function,
+ * the most likely matching pkey has index qp->s_pkey_index
+ */
+ if (!is_user_ctxt_mechanism &&
+ egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+ return 0;
+ }
+
+ for (i = 0; i < MAX_PKEY_VALUES; i++) {
+ if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+ return 0;
+ }
+bad:
+ /*
+ * For the user-context mechanism, the P_KEY check would only happen
+ * once per SDMA request, not once per packet. Therefore, there's no
+ * need to increment the counter for the user-context mechanism.
+ */
+ if (!is_user_ctxt_mechanism) {
+ incr_cntr64(&ppd->port_xmit_constraint_errors);
+ dd = ppd->dd;
+ if (!(dd->err_info_xmit_constraint.status &
+ OPA_EI_STATUS_SMASK)) {
+ u16 slid = be16_to_cpu(lrh[3]);
+
+ dd->err_info_xmit_constraint.status |=
+ OPA_EI_STATUS_SMASK;
+ dd->err_info_xmit_constraint.slid = slid;
+ dd->err_info_xmit_constraint.pkey = pkey;
+ }
+ }
+ return 1;
+}
+
+/**
+ * get_send_routine - choose an egress routine
+ *
+ * Choose an egress routine based on QP type
+ * and size
+ */
+static inline send_routine get_send_routine(struct rvt_qp *qp,
+ struct verbs_txreq *tx)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ib_header *h = &tx->phdr.hdr;
+
+ if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
+ return dd->process_pio_send;
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ return dd->process_pio_send;
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ break;
+ case IB_QPT_RC:
+ if (piothreshold &&
+ qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+ (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) &&
+ iowait_sdma_pending(&priv->s_iowait) == 0 &&
+ !sdma_txreq_built(&tx->txreq))
+ return dd->process_pio_send;
+ break;
+ case IB_QPT_UC:
+ if (piothreshold &&
+ qp->s_cur_size <= min(piothreshold, qp->pmtu) &&
+ (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) &&
+ iowait_sdma_pending(&priv->s_iowait) == 0 &&
+ !sdma_txreq_built(&tx->txreq))
+ return dd->process_pio_send;
+ break;
+ default:
+ break;
+ }
+ return dd->process_dma_send;
+}
+
+/**
+ * hfi1_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @ps: the state of the packet to send
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
+ */
+int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_other_headers *ohdr;
+ struct hfi1_ib_header *hdr;
+ send_routine sr;
+ int ret;
+ u8 lnh;
+
+ hdr = &ps->s_txreq->phdr.hdr;
+ /* locate the pkey within the headers */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == HFI1_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else
+ ohdr = &hdr->u.oth;
+
+ sr = get_send_routine(qp, ps->s_txreq);
+ ret = egress_pkey_check(dd->pport,
+ hdr->lrh,
+ ohdr->bth,
+ priv->s_sc,
+ qp->s_pkey_index);
+ if (unlikely(ret)) {
+ /*
+ * The value we are returning here does not get propagated to
+ * the verbs caller. Thus we need to complete the request with
+ * error otherwise the caller could be sitting waiting on the
+ * completion event. Only do this for PIO. SDMA has its own
+ * mechanism for handling the errors. So for SDMA we can just
+ * return.
+ */
+ if (sr == dd->process_pio_send) {
+ unsigned long flags;
+
+ hfi1_cdbg(PIO, "%s() Failed. Completing with err",
+ __func__);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ return -EINVAL;
+ }
+ if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
+ return pio_wait(qp,
+ ps->s_txreq->psc,
+ ps,
+ RVT_S_WAIT_PIO_DRAIN);
+ return sr(qp, ps, 0);
+}
+
+/**
+ * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
+ * @dd: the device data structure
+ */
+static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
+{
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+ u16 ver = dd->dc8051_ver;
+
+ memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
+
+ rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 16) |
+ (u64)dc8051_ver_min(ver);
+ rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
+ IB_DEVICE_MEM_MGT_EXTENSIONS;
+ rdi->dparms.props.page_size_cap = PAGE_SIZE;
+ rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
+ rdi->dparms.props.vendor_part_id = dd->pcidev->device;
+ rdi->dparms.props.hw_ver = dd->minrev;
+ rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
+ rdi->dparms.props.max_mr_size = U64_MAX;
+ rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
+ rdi->dparms.props.max_qp = hfi1_max_qps;
+ rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+ rdi->dparms.props.max_sge = hfi1_max_sges;
+ rdi->dparms.props.max_sge_rd = hfi1_max_sges;
+ rdi->dparms.props.max_cq = hfi1_max_cqs;
+ rdi->dparms.props.max_ah = hfi1_max_ahs;
+ rdi->dparms.props.max_cqe = hfi1_max_cqes;
+ rdi->dparms.props.max_mr = rdi->lkey_table.max;
+ rdi->dparms.props.max_fmr = rdi->lkey_table.max;
+ rdi->dparms.props.max_map_per_fmr = 32767;
+ rdi->dparms.props.max_pd = hfi1_max_pds;
+ rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
+ rdi->dparms.props.max_qp_init_rd_atom = 255;
+ rdi->dparms.props.max_srq = hfi1_max_srqs;
+ rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
+ rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
+ rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
+ rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
+ rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
+ rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
+ rdi->dparms.props.max_total_mcast_qp_attach =
+ rdi->dparms.props.max_mcast_qp_attach *
+ rdi->dparms.props.max_mcast_grp;
+}
+
+static inline u16 opa_speed_to_ib(u16 in)
+{
+ u16 out = 0;
+
+ if (in & OPA_LINK_SPEED_25G)
+ out |= IB_SPEED_EDR;
+ if (in & OPA_LINK_SPEED_12_5G)
+ out |= IB_SPEED_FDR;
+
+ return out;
+}
+
+/*
+ * Convert a single OPA link width (no multiple flags) to an IB value.
+ * A zero OPA link width means link down, which means the IB width value
+ * is a don't care.
+ */
+static inline u16 opa_width_to_ib(u16 in)
+{
+ switch (in) {
+ case OPA_LINK_WIDTH_1X:
+ /* map 2x and 3x to 1x as they don't exist in IB */
+ case OPA_LINK_WIDTH_2X:
+ case OPA_LINK_WIDTH_3X:
+ return IB_WIDTH_1X;
+ default: /* link down or unknown, return our largest width */
+ case OPA_LINK_WIDTH_4X:
+ return IB_WIDTH_4X;
+ }
+}
+
+static int query_port(struct rvt_dev_info *rdi, u8 port_num,
+ struct ib_port_attr *props)
+{
+ struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+ struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+ struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
+ u16 lid = ppd->lid;
+
+ props->lid = lid ? lid : 0;
+ props->lmc = ppd->lmc;
+ /* OPA logical states match IB logical states */
+ props->state = driver_lstate(ppd);
+ props->phys_state = hfi1_ibphys_portstate(ppd);
+ props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
+ props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
+ /* see rate_show() in ib core/sysfs.c */
+ props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+ props->max_vl_num = ppd->vls_supported;
+
+ /* Once we are a "first class" citizen and have added the OPA MTUs to
+ * the core we can advertise the larger MTU enum to the ULPs, for now
+ * advertise only 4K.
+ *
+ * Those applications which are either OPA aware or pass the MTU enum
+ * from the Path Records to us will get the new 8k MTU. Those that
+ * attempt to process the MTU enum may fail in various ways.
+ */
+ props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
+ 4096 : hfi1_max_mtu), IB_MTU_4096);
+ props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
+ mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+
+ return 0;
+}
+
+static int modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ struct hfi1_devdata *dd = dd_from_ibdev(device);
+ unsigned i;
+ int ret;
+
+ if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+ IB_DEVICE_MODIFY_NODE_DESC)) {
+ ret = -EOPNOTSUPP;
+ goto bail;
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ memcpy(device->node_desc, device_modify->node_desc, 64);
+ for (i = 0; i < dd->num_pports; i++) {
+ struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+ hfi1_node_desc_chg(ibp);
+ }
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+ ib_hfi1_sys_image_guid =
+ cpu_to_be64(device_modify->sys_image_guid);
+ for (i = 0; i < dd->num_pports; i++) {
+ struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+ hfi1_sys_guid_chg(ibp);
+ }
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
+{
+ struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
+ struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
+ struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
+ int ret;
+
+ set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
+ OPA_LINKDOWN_REASON_UNKNOWN);
+ ret = set_link_state(ppd, HLS_DN_DOWNDEF);
+ return ret;
+}
+
+static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
+ int guid_index, __be64 *guid)
+{
+ struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+ if (guid_index == 0)
+ *guid = cpu_to_be64(ppd->guid);
+ else if (guid_index < HFI1_GUIDS_PER_PORT)
+ *guid = ibp->guids[guid_index - 1];
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * convert ah port,sl to sc
+ */
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
+{
+ struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
+
+ return ibp->sl_to_sc[ah->sl];
+}
+
+static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ u8 sc5;
+
+ /* test the mapping for validity */
+ ibp = to_iport(ibdev, ah_attr->port_num);
+ ppd = ppd_from_ibp(ibp);
+ sc5 = ibp->sl_to_sc[ah_attr->sl];
+ dd = dd_from_ppd(ppd);
+ if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
+ return -EINVAL;
+ return 0;
+}
+
+static void hfi1_notify_new_ah(struct ib_device *ibdev,
+ struct ib_ah_attr *ah_attr,
+ struct rvt_ah *ah)
+{
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ u8 sc5;
+
+ /*
+ * Do not trust reading anything from rvt_ah at this point as it is not
+ * done being setup. We can however modify things which we need to set.
+ */
+
+ ibp = to_iport(ibdev, ah_attr->port_num);
+ ppd = ppd_from_ibp(ibp);
+ sc5 = ibp->sl_to_sc[ah->attr.sl];
+ dd = dd_from_ppd(ppd);
+ ah->vl = sc_to_vlt(dd, sc5);
+ if (ah->vl < num_vls || ah->vl == 15)
+ ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
+}
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
+{
+ struct ib_ah_attr attr;
+ struct ib_ah *ah = ERR_PTR(-EINVAL);
+ struct rvt_qp *qp0;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.dlid = dlid;
+ attr.port_num = ppd_from_ibp(ibp)->port;
+ rcu_read_lock();
+ qp0 = rcu_dereference(ibp->rvp.qp[0]);
+ if (qp0)
+ ah = ib_create_ah(qp0->ibqp.pd, &attr);
+ rcu_read_unlock();
+ return ah;
+}
+
+/**
+ * hfi1_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the hfi1_ib device
+ */
+unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
+{
+ return ARRAY_SIZE(dd->pport[0].pkeys);
+}
+
+static void init_ibport(struct hfi1_pportdata *ppd)
+{
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
+ int i;
+
+ for (i = 0; i < sz; i++) {
+ ibp->sl_to_sc[i] = i;
+ ibp->sc_to_sl[i] = i;
+ }
+
+ spin_lock_init(&ibp->rvp.lock);
+ /* Set the prefix to the default value (see ch. 4.1.1) */
+ ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
+ ibp->rvp.sm_lid = 0;
+ /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+ ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
+ IB_PORT_CAP_MASK_NOTICE_SUP;
+ ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+ ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+ ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+ ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+ ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+ RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
+ RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
+}
+
+static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str,
+ size_t str_len)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ struct hfi1_ibdev *dev = dev_from_rdi(rdi);
+ u16 ver = dd_from_dev(dev)->dc8051_ver;
+
+ snprintf(str, str_len, "%u.%u", dc8051_ver_maj(ver),
+ dc8051_ver_min(ver));
+}
+
+/**
+ * hfi1_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 if successful, errno if unsuccessful.
+ */
+int hfi1_register_ib_device(struct hfi1_devdata *dd)
+{
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+ struct ib_device *ibdev = &dev->rdi.ibdev;
+ struct hfi1_pportdata *ppd = dd->pport;
+ unsigned i;
+ int ret;
+ size_t lcpysz = IB_DEVICE_NAME_MAX;
+
+ for (i = 0; i < dd->num_pports; i++)
+ init_ibport(ppd + i);
+
+ /* Only need to initialize non-zero fields. */
+
+ setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev);
+
+ seqlock_init(&dev->iowait_lock);
+ INIT_LIST_HEAD(&dev->txwait);
+ INIT_LIST_HEAD(&dev->memwait);
+
+ ret = verbs_txreq_init(dev);
+ if (ret)
+ goto err_verbs_txreq;
+
+ /*
+ * The system image GUID is supposed to be the same for all
+ * HFIs in a single system but since there can be other
+ * device types in the system, we can't be sure this is unique.
+ */
+ if (!ib_hfi1_sys_image_guid)
+ ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+ lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
+ strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
+ ibdev->owner = THIS_MODULE;
+ ibdev->node_guid = cpu_to_be64(ppd->guid);
+ ibdev->phys_port_cnt = dd->num_pports;
+ ibdev->dma_device = &dd->pcidev->dev;
+ ibdev->modify_device = modify_device;
+
+ /* keep process mad in the driver */
+ ibdev->process_mad = hfi1_process_mad;
+ ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
+
+ strncpy(ibdev->node_desc, init_utsname()->nodename,
+ sizeof(ibdev->node_desc));
+
+ /*
+ * Fill in rvt info object.
+ */
+ dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
+ dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name;
+ dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
+ dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
+ dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
+ dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
+ dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
+ dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
+ dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
+ /*
+ * Fill in rvt info device attributes.
+ */
+ hfi1_fill_device_attr(dd);
+
+ /* queue pair */
+ dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
+ dd->verbs_dev.rdi.dparms.qpn_start = 0;
+ dd->verbs_dev.rdi.dparms.qpn_inc = 1;
+ dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
+ dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
+ dd->verbs_dev.rdi.dparms.qpn_res_end =
+ dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
+ dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
+ dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
+ dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
+ dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
+ dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
+ dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
+
+ dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+ dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
+ dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
+ dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
+ dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send;
+ dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
+ dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
+ dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
+ dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
+ dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
+ dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
+ dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
+ dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
+ dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
+ dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
+ dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
+ dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
+ dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+
+ /* completeion queue */
+ snprintf(dd->verbs_dev.rdi.dparms.cq_name,
+ sizeof(dd->verbs_dev.rdi.dparms.cq_name),
+ "hfi1_cq%d", dd->unit);
+ dd->verbs_dev.rdi.dparms.node = dd->node;
+
+ /* misc settings */
+ dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
+ dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
+ dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
+ dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+
+ /* post send table */
+ dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
+
+ ppd = dd->pport;
+ for (i = 0; i < dd->num_pports; i++, ppd++)
+ rvt_init_port(&dd->verbs_dev.rdi,
+ &ppd->ibport_data.rvp,
+ i,
+ ppd->pkeys);
+
+ ret = rvt_register_device(&dd->verbs_dev.rdi);
+ if (ret)
+ goto err_verbs_txreq;
+
+ ret = hfi1_verbs_register_sysfs(dd);
+ if (ret)
+ goto err_class;
+
+ return ret;
+
+err_class:
+ rvt_unregister_device(&dd->verbs_dev.rdi);
+err_verbs_txreq:
+ verbs_txreq_exit(dev);
+ dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+ return ret;
+}
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
+{
+ struct hfi1_ibdev *dev = &dd->verbs_dev;
+
+ hfi1_verbs_unregister_sysfs(dd);
+
+ rvt_unregister_device(&dd->verbs_dev.rdi);
+
+ if (!list_empty(&dev->txwait))
+ dd_dev_err(dd, "txwait list not empty!\n");
+ if (!list_empty(&dev->memwait))
+ dd_dev_err(dd, "memwait list not empty!\n");
+
+ del_timer_sync(&dev->mem_timer);
+ verbs_txreq_exit(dev);
+}
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_ib_header *hdr = packet->hdr;
+ struct rvt_qp *qp = packet->qp;
+ u32 lqpn, rqpn = 0;
+ u16 rlid = 0;
+ u8 sl, sc5, svc_type;
+
+ switch (packet->qp->ibqp.qp_type) {
+ case IB_QPT_UC:
+ rlid = qp->remote_ah_attr.dlid;
+ rqpn = qp->remote_qpn;
+ svc_type = IB_CC_SVCTYPE_UC;
+ break;
+ case IB_QPT_RC:
+ rlid = qp->remote_ah_attr.dlid;
+ rqpn = qp->remote_qpn;
+ svc_type = IB_CC_SVCTYPE_RC;
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ svc_type = IB_CC_SVCTYPE_UD;
+ break;
+ default:
+ ibp->rvp.n_pkt_drops++;
+ return;
+ }
+
+ sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf);
+ sl = ibp->sc_to_sl[sc5];
+ lqpn = qp->ibqp.qp_num;
+
+ process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
new file mode 100644
index 000000000000..d1b101c54828
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_H
+#define HFI1_VERBS_H
+
+#include <linux/types.h>
+#include <linux/seqlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+#include <rdma/rdmavt_cq.h>
+
+struct hfi1_ctxtdata;
+struct hfi1_pportdata;
+struct hfi1_devdata;
+struct hfi1_packet;
+
+#include "iowait.h"
+
+#define HFI1_MAX_RDMA_ATOMIC 16
+#define HFI1_GUIDS_PER_PORT 5
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define HFI1_UVERBS_ABI_VERSION 2
+
+#define IB_SEQ_NAK (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
+
+#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK BIT(31)
+#define IB_BTH_SOLICITED BIT(23)
+#define IB_BTH_MIG_REQ BIT(22)
+
+#define IB_GRH_VERSION 6
+#define IB_GRH_VERSION_MASK 0xF
+#define IB_GRH_VERSION_SHIFT 28
+#define IB_GRH_TCLASS_MASK 0xFF
+#define IB_GRH_TCLASS_SHIFT 20
+#define IB_GRH_FLOW_MASK 0xFFFFF
+#define IB_GRH_FLOW_SHIFT 0
+#define IB_GRH_NEXT_HDR 0x1B
+
+#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
+
+/* flags passed by hfi1_ib_rcv() */
+enum {
+ HFI1_HAS_GRH = (1 << 0),
+};
+
+struct ib_reth {
+ __be64 vaddr;
+ __be32 rkey;
+ __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+ __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
+ __be32 rkey;
+ __be64 swap_data;
+ __be64 compare_data;
+} __packed;
+
+union ib_ehdrs {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be32 atomic_ack_eth[2];
+ } at;
+ __be32 imm_data;
+ __be32 aeth;
+ __be32 ieth;
+ struct ib_atomic_eth atomic_eth;
+} __packed;
+
+struct hfi1_other_headers {
+ __be32 bth[3];
+ union ib_ehdrs u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data). Only the first 56 bytes of the IB header
+ * will be in the eager header buffer. The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct hfi1_ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct hfi1_other_headers oth;
+ } l;
+ struct hfi1_other_headers oth;
+ } u;
+} __packed;
+
+struct hfi1_ahg_info {
+ u32 ahgdesc[2];
+ u16 tx_flags;
+ u8 ahgcount;
+ u8 ahgidx;
+};
+
+struct hfi1_sdma_header {
+ __le64 pbc;
+ struct hfi1_ib_header hdr;
+} __packed;
+
+/*
+ * hfi1 specific data structures that will be hidden from rvt after the queue
+ * pair is made common
+ */
+struct hfi1_qp_priv {
+ struct hfi1_ahg_info *s_ahg; /* ahg info for next header */
+ struct sdma_engine *s_sde; /* current sde */
+ struct send_context *s_sendcontext; /* current sendcontext */
+ u8 s_sc; /* SC[0..4] for next packet */
+ u8 r_adefered; /* number of acks defered */
+ struct iowait s_iowait;
+ struct timer_list s_rnr_timer;
+ struct rvt_qp *owner;
+};
+
+/*
+ * This structure is used to hold commonly lookedup and computed values during
+ * the send engine progress.
+ */
+struct hfi1_pkt_state {
+ struct hfi1_ibdev *dev;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct verbs_txreq *s_txreq;
+ unsigned long flags;
+};
+
+#define HFI1_PSN_CREDIT 16
+
+struct hfi1_opcode_stats {
+ u64 n_packets; /* number of packets */
+ u64 n_bytes; /* total number of bytes */
+};
+
+struct hfi1_opcode_stats_perctx {
+ struct hfi1_opcode_stats stats[256];
+};
+
+static inline void inc_opstats(
+ u32 tlen,
+ struct hfi1_opcode_stats *stats)
+{
+#ifdef CONFIG_DEBUG_FS
+ stats->n_bytes += tlen;
+ stats->n_packets++;
+#endif
+}
+
+struct hfi1_ibport {
+ struct rvt_qp __rcu *qp[2];
+ struct rvt_ibport rvp;
+
+ __be64 guids[HFI1_GUIDS_PER_PORT - 1]; /* writable GUIDs */
+
+ /* the first 16 entries are sl_to_vl for !OPA */
+ u8 sl_to_sc[32];
+ u8 sc_to_sl[32];
+};
+
+struct hfi1_ibdev {
+ struct rvt_dev_info rdi; /* Must be first */
+
+ /* QP numbers are shared by all IB ports */
+ /* protect wait lists */
+ seqlock_t iowait_lock;
+ struct list_head txwait; /* list for wait verbs_txreq */
+ struct list_head memwait; /* list for wait kernel memory */
+ struct list_head txreq_free;
+ struct kmem_cache *verbs_txreq_cache;
+ struct timer_list mem_timer;
+
+ u64 n_piowait;
+ u64 n_piodrain;
+ u64 n_txwait;
+ u64 n_kmem_wait;
+
+#ifdef CONFIG_DEBUG_FS
+ /* per HFI debugfs */
+ struct dentry *hfi1_ibdev_dbg;
+ /* per HFI symlinks to above */
+ struct dentry *hfi1_ibdev_link;
+#endif
+};
+
+static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
+{
+ struct rvt_dev_info *rdi;
+
+ rdi = container_of(ibdev, struct rvt_dev_info, ibdev);
+ return container_of(rdi, struct hfi1_ibdev, rdi);
+}
+
+static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
+{
+ struct hfi1_qp_priv *priv;
+
+ priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait);
+ return priv->owner;
+}
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct rvt_qp *qp)
+{
+ return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) &&
+ (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+ u32 qp1, u32 qp2, u16 lid1, u16 lid2);
+void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num);
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+ struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+ u16 *out_mad_pkey_index);
+
+/*
+ * The PSN_MASK and PSN_SHIFT allow for
+ * 1) comparing two PSNs
+ * 2) returning the PSN with any upper bits masked
+ * 3) returning the difference between to PSNs
+ *
+ * The number of significant bits in the PSN must
+ * necessarily be at least one bit less than
+ * the container holding the PSN.
+ */
+#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
+#define PSN_MASK 0xFFFFFF
+#define PSN_SHIFT 8
+#else
+#define PSN_MASK 0x7FFFFFFF
+#define PSN_SHIFT 1
+#endif
+#define PSN_MODIFY_MASK 0xFFFFFF
+
+/*
+ * Compare the lower 24 bits of the msn values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_msn(u32 a, u32 b)
+{
+ return (((int)a) - ((int)b)) << 8;
+}
+
+/*
+ * Compare two PSNs
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_psn(u32 a, u32 b)
+{
+ return (((int)a) - ((int)b)) << PSN_SHIFT;
+}
+
+/*
+ * Return masked PSN
+ */
+static inline u32 mask_psn(u32 a)
+{
+ return a & PSN_MASK;
+}
+
+/*
+ * Return delta between two PSNs
+ */
+static inline u32 delta_psn(u32 a, u32 b)
+{
+ return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
+}
+
+struct verbs_txreq;
+void hfi1_put_txreq(struct verbs_txreq *tx);
+
+int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
+ int release, int copy_last);
+
+void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release);
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet);
+
+void hfi1_uc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_hdrerr(
+ struct hfi1_ctxtdata *rcd,
+ struct hfi1_ib_header *hdr,
+ u32 rcv_flags,
+ struct rvt_qp *qp);
+
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
+
+void hfi1_rc_rnr_retry(unsigned long arg);
+void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to);
+void hfi1_rc_timeout(unsigned long arg);
+void hfi1_del_timers_sync(struct rvt_qp *qp);
+void hfi1_stop_rc_timers(struct rvt_qp *qp);
+
+void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr);
+
+void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
+
+void hfi1_ud_rcv(struct hfi1_packet *packet);
+
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
+
+int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only);
+
+void hfi1_migrate_qp(struct rvt_qp *qp);
+
+int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+extern const u32 rc_only_opcode;
+extern const u32 uc_only_opcode;
+
+static inline u8 get_opcode(struct hfi1_ib_header *h)
+{
+ u16 lnh = be16_to_cpu(h->lrh[0]) & 3;
+
+ if (lnh == IB_LNH_IBA_LOCAL)
+ return be32_to_cpu(h->u.oth.bth[0]) >> 24;
+ else
+ return be32_to_cpu(h->u.l.oth.bth[0]) >> 24;
+}
+
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+ int has_grh, struct rvt_qp *qp, u32 bth0);
+
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
+ u32 bth0, u32 bth2, int middle,
+ struct hfi1_pkt_state *ps);
+
+void _hfi1_do_send(struct work_struct *work);
+
+void hfi1_do_send(struct rvt_qp *qp);
+
+void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ enum ib_wc_status status);
+
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn);
+
+int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+int hfi1_register_ib_device(struct hfi1_devdata *);
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *);
+
+void hfi1_ib_rcv(struct hfi1_packet *packet);
+
+unsigned hfi1_get_npkeys(struct hfi1_devdata *);
+
+int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ u64 pbc);
+
+int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ u64 pbc);
+
+int hfi1_wss_init(void);
+void hfi1_wss_exit(void);
+
+/* platform specific: return the lowest level cache (llc) size, in KiB */
+static inline int wss_llc_size(void)
+{
+ /* assume that the boot CPU value is universal for all CPUs */
+ return boot_cpu_data.x86_cache_size;
+}
+
+/* platform specific: cacheless copy */
+static inline void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+ /*
+ * Use the only available X64 cacheless copy. Add a __user cast
+ * to quiet sparse. The src agument is already in the kernel so
+ * there are no security issues. The extra fault recovery machinery
+ * is not invoked.
+ */
+ __copy_user_nocache(dst, (void __user *)src, n, 0);
+}
+
+extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
+
+extern const u8 hdr_len_by_opcode[];
+
+extern const int ib_rvt_state_ops[];
+
+extern __be64 ib_hfi1_sys_image_guid; /* in network order */
+
+extern unsigned int hfi1_max_cqes;
+
+extern unsigned int hfi1_max_cqs;
+
+extern unsigned int hfi1_max_qp_wrs;
+
+extern unsigned int hfi1_max_qps;
+
+extern unsigned int hfi1_max_sges;
+
+extern unsigned int hfi1_max_mcast_grps;
+
+extern unsigned int hfi1_max_mcast_qp_attached;
+
+extern unsigned int hfi1_max_srqs;
+
+extern unsigned int hfi1_max_srq_sges;
+
+extern unsigned int hfi1_max_srq_wrs;
+
+extern unsigned short piothreshold;
+
+extern const u32 ib_hfi1_rnr_table[];
+
+#endif /* HFI1_VERBS_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
new file mode 100644
index 000000000000..d8fb056526f8
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "verbs_txreq.h"
+#include "qp.h"
+#include "trace.h"
+
+#define TXREQ_LEN 24
+
+void hfi1_put_txreq(struct verbs_txreq *tx)
+{
+ struct hfi1_ibdev *dev;
+ struct rvt_qp *qp;
+ unsigned long flags;
+ unsigned int seq;
+ struct hfi1_qp_priv *priv;
+
+ qp = tx->qp;
+ dev = to_idev(qp->ibqp.device);
+
+ if (tx->mr)
+ rvt_put_mr(tx->mr);
+
+ sdma_txclean(dd_from_dev(dev), &tx->txreq);
+
+ /* Free verbs_txreq and return to slab cache */
+ kmem_cache_free(dev->verbs_txreq_cache, tx);
+
+ do {
+ seq = read_seqbegin(&dev->iowait_lock);
+ if (!list_empty(&dev->txwait)) {
+ struct iowait *wait;
+
+ write_seqlock_irqsave(&dev->iowait_lock, flags);
+ wait = list_first_entry(&dev->txwait, struct iowait,
+ list);
+ qp = iowait_to_qp(wait);
+ priv = qp->priv;
+ list_del_init(&priv->s_iowait.list);
+ /* refcount held until actual wake up */
+ write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+ hfi1_qp_wakeup(qp, RVT_S_WAIT_TX);
+ break;
+ }
+ } while (read_seqretry(&dev->iowait_lock, seq));
+}
+
+struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+ struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct verbs_txreq *tx = ERR_PTR(-EBUSY);
+
+ write_seqlock(&dev->iowait_lock);
+ if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
+ struct hfi1_qp_priv *priv;
+
+ tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+ if (tx)
+ goto out;
+ priv = qp->priv;
+ if (list_empty(&priv->s_iowait.list)) {
+ dev->n_txwait++;
+ qp->s_flags |= RVT_S_WAIT_TX;
+ list_add_tail(&priv->s_iowait.list, &dev->txwait);
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX);
+ atomic_inc(&qp->refcount);
+ }
+ qp->s_flags &= ~RVT_S_BUSY;
+ }
+out:
+ write_sequnlock(&dev->iowait_lock);
+ return tx;
+}
+
+static void verbs_txreq_kmem_cache_ctor(void *obj)
+{
+ struct verbs_txreq *tx = (struct verbs_txreq *)obj;
+
+ memset(tx, 0, sizeof(*tx));
+}
+
+int verbs_txreq_init(struct hfi1_ibdev *dev)
+{
+ char buf[TXREQ_LEN];
+ struct hfi1_devdata *dd = dd_from_dev(dev);
+
+ snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit);
+ dev->verbs_txreq_cache = kmem_cache_create(buf,
+ sizeof(struct verbs_txreq),
+ 0, SLAB_HWCACHE_ALIGN,
+ verbs_txreq_kmem_cache_ctor);
+ if (!dev->verbs_txreq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void verbs_txreq_exit(struct hfi1_ibdev *dev)
+{
+ kmem_cache_destroy(dev->verbs_txreq_cache);
+ dev->verbs_txreq_cache = NULL;
+}
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
new file mode 100644
index 000000000000..5660897593ba
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_TXREQ_H
+#define HFI1_VERBS_TXREQ_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include "verbs.h"
+#include "sdma_txreq.h"
+#include "iowait.h"
+
+struct verbs_txreq {
+ struct hfi1_sdma_header phdr;
+ struct sdma_txreq txreq;
+ struct rvt_qp *qp;
+ struct rvt_swqe *wqe;
+ struct rvt_mregion *mr;
+ struct rvt_sge_state *ss;
+ struct sdma_engine *sde;
+ struct send_context *psc;
+ u16 hdr_dwords;
+};
+
+struct hfi1_ibdev;
+struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+ struct rvt_qp *qp);
+
+static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
+ struct rvt_qp *qp)
+ __must_hold(&qp->slock)
+{
+ struct verbs_txreq *tx;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+ if (unlikely(!tx)) {
+ /* call slow path to get the lock */
+ tx = __get_txreq(dev, qp);
+ if (IS_ERR(tx))
+ return tx;
+ }
+ tx->qp = qp;
+ tx->mr = NULL;
+ tx->sde = priv->s_sde;
+ tx->psc = priv->s_sendcontext;
+ /* so that we can test if the sdma decriptors are there */
+ tx->txreq.num_desc = 0;
+ return tx;
+}
+
+static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx)
+{
+ return &tx->txreq;
+}
+
+static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
+{
+ struct sdma_txreq *stx;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ stx = iowait_get_txhead(&priv->s_iowait);
+ if (stx)
+ return container_of(stx, struct verbs_txreq, txreq);
+ return NULL;
+}
+
+void hfi1_put_txreq(struct verbs_txreq *tx);
+int verbs_txreq_init(struct hfi1_ibdev *dev);
+void verbs_txreq_exit(struct hfi1_ibdev *dev);
+
+#endif /* HFI1_VERBS_TXREQ_H */
diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
index 819767681445..8ec09e470f84 100644
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -50,8 +50,6 @@
#include <rdma/ib_pack.h>
#include <rdma/rdma_cm.h>
#include <rdma/iw_cm.h>
-#include <rdma/iw_portmap.h>
-#include <rdma/rdma_netlink.h>
#include <crypto/hash.h>
#include "i40iw_status.h"
@@ -115,6 +113,8 @@
#define IW_HMC_OBJ_TYPE_NUM ARRAY_SIZE(iw_hmc_obj_types)
#define IW_CFG_FPM_QP_COUNT 32768
+#define I40IW_MAX_PAGES_PER_FMR 512
+#define I40IW_MIN_PAGES_PER_FMR 1
#define I40IW_MTU_TO_MSS 40
#define I40IW_DEFAULT_MSS 1460
@@ -232,7 +232,7 @@ struct i40iw_device {
struct i40e_client *client;
struct i40iw_hw hw;
struct i40iw_cm_core cm_core;
- unsigned long *mem_resources;
+ u8 *mem_resources;
unsigned long *allocated_qps;
unsigned long *allocated_cqs;
unsigned long *allocated_mrs;
@@ -254,6 +254,7 @@ struct i40iw_device {
u32 arp_table_size;
u32 next_arp_index;
spinlock_t resource_lock; /* hw resource access */
+ spinlock_t qptable_lock;
u32 vendor_id;
u32 vendor_part_id;
u32 of_device_registered;
@@ -392,7 +393,7 @@ void i40iw_flush_wqes(struct i40iw_device *iwdev,
void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
unsigned char *mac_addr,
- __be32 *ip_addr,
+ u32 *ip_addr,
bool ipv4,
u32 action);
@@ -434,8 +435,8 @@ static inline int i40iw_alloc_resource(struct i40iw_device *iwdev,
*next = resource_num + 1;
if (*next == max_resources)
*next = 0;
- spin_unlock_irqrestore(&iwdev->resource_lock, flags);
*req_resource_num = resource_num;
+ spin_unlock_irqrestore(&iwdev->resource_lock, flags);
return 0;
}
@@ -550,7 +551,7 @@ enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev,
struct i40iw_qp_flush_info *info,
bool wait);
-void i40iw_copy_ip_ntohl(u32 *dst, u32 *src);
+void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src);
struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd,
u64 addr,
u64 size,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 38f917a6c778..7ca0638579c0 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -535,8 +535,8 @@ static struct i40iw_puda_buf *i40iw_form_cm_frame(struct i40iw_cm_node *cm_node,
buf += hdr_len;
}
- if (pd_len)
- memcpy(buf, pdata->addr, pd_len);
+ if (pdata && pdata->addr)
+ memcpy(buf, pdata->addr, pdata->size);
atomic_set(&sqbuf->refcount, 1);
@@ -771,6 +771,7 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node,
{
struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
+ u16 ctrl_ird, ctrl_ord;
/* initialize the upper 5 bytes of the frame */
i40iw_build_mpa_v1(cm_node, start_addr, mpa_key);
@@ -779,38 +780,38 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node,
/* initialize RTR msg */
if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
- rtr_msg->ctrl_ird = IETF_NO_IRD_ORD;
- rtr_msg->ctrl_ord = IETF_NO_IRD_ORD;
+ ctrl_ird = IETF_NO_IRD_ORD;
+ ctrl_ord = IETF_NO_IRD_ORD;
} else {
- rtr_msg->ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ?
+ ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ?
IETF_NO_IRD_ORD : cm_node->ird_size;
- rtr_msg->ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ?
+ ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ?
IETF_NO_IRD_ORD : cm_node->ord_size;
}
- rtr_msg->ctrl_ird |= IETF_PEER_TO_PEER;
- rtr_msg->ctrl_ird |= IETF_FLPDU_ZERO_LEN;
+ ctrl_ird |= IETF_PEER_TO_PEER;
+ ctrl_ird |= IETF_FLPDU_ZERO_LEN;
switch (mpa_key) {
case MPA_KEY_REQUEST:
- rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE;
- rtr_msg->ctrl_ord |= IETF_RDMA0_READ;
+ ctrl_ord |= IETF_RDMA0_WRITE;
+ ctrl_ord |= IETF_RDMA0_READ;
break;
case MPA_KEY_REPLY:
switch (cm_node->send_rdma0_op) {
case SEND_RDMA_WRITE_ZERO:
- rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE;
+ ctrl_ord |= IETF_RDMA0_WRITE;
break;
case SEND_RDMA_READ_ZERO:
- rtr_msg->ctrl_ord |= IETF_RDMA0_READ;
+ ctrl_ord |= IETF_RDMA0_READ;
break;
}
break;
default:
break;
}
- rtr_msg->ctrl_ird = htons(rtr_msg->ctrl_ird);
- rtr_msg->ctrl_ord = htons(rtr_msg->ctrl_ord);
+ rtr_msg->ctrl_ird = htons(ctrl_ird);
+ rtr_msg->ctrl_ord = htons(ctrl_ord);
}
/**
@@ -1566,12 +1567,12 @@ static enum i40iw_status_code i40iw_del_multiple_qhash(
ret = i40iw_manage_qhash(iwdev, cm_info,
I40IW_QHASH_TYPE_TCP_SYN,
I40IW_QHASH_MANAGE_TYPE_DELETE, NULL, false);
- kfree(child_listen_node);
- cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
i40iw_debug(&iwdev->sc_dev,
I40IW_DEBUG_CM,
"freed pointer = %p\n",
child_listen_node);
+ kfree(child_listen_node);
+ cm_parent_listen_node->cm_core->stats_listen_nodes_destroyed++;
}
spin_unlock_irqrestore(&iwdev->cm_core.listen_list_lock, flags);
@@ -2107,7 +2108,7 @@ static bool i40iw_ipv6_is_loopback(u32 *loc_addr, u32 *rem_addr)
struct in6_addr raddr6;
i40iw_copy_ip_htonl(raddr6.in6_u.u6_addr32, rem_addr);
- return (!memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6));
+ return !memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6);
}
/**
@@ -2160,7 +2161,7 @@ static struct i40iw_cm_node *i40iw_make_cm_node(
cm_node->tcp_cntxt.rcv_wnd =
I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE;
ts = current_kernel_time();
- cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec);
+ cm_node->tcp_cntxt.loc_seq_num = ts.tv_nsec;
cm_node->tcp_cntxt.mss = iwdev->mss;
cm_node->iwdev = iwdev;
@@ -2234,7 +2235,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node)
if (cm_node->listener) {
i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true);
} else {
- if (!i40iw_listen_port_in_use(cm_core, htons(cm_node->loc_port)) &&
+ if (!i40iw_listen_port_in_use(cm_core, cm_node->loc_port) &&
cm_node->apbvt_set && cm_node->iwdev) {
i40iw_manage_apbvt(cm_node->iwdev,
cm_node->loc_port,
@@ -2852,7 +2853,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
void *private_data,
struct i40iw_cm_info *cm_info)
{
- int ret;
struct i40iw_cm_node *cm_node;
struct i40iw_cm_listener *loopback_remotelistener;
struct i40iw_cm_node *loopback_remotenode;
@@ -2922,30 +2922,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node(
memcpy(cm_node->pdata_buf, private_data, private_data_len);
cm_node->state = I40IW_CM_STATE_SYN_SENT;
- ret = i40iw_send_syn(cm_node, 0);
-
- if (ret) {
- if (cm_node->ipv4)
- i40iw_debug(cm_node->dev,
- I40IW_DEBUG_CM,
- "Api - connect() FAILED: dest addr=%pI4",
- cm_node->rem_addr);
- else
- i40iw_debug(cm_node->dev, I40IW_DEBUG_CM,
- "Api - connect() FAILED: dest addr=%pI6",
- cm_node->rem_addr);
- i40iw_rem_ref_cm_node(cm_node);
- cm_node = NULL;
- }
-
- if (cm_node)
- i40iw_debug(cm_node->dev,
- I40IW_DEBUG_CM,
- "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n",
- cm_node->rem_port,
- cm_node,
- cm_node->cm_id);
-
return cm_node;
}
@@ -3266,11 +3242,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node,
tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[0]);
tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[0]);
- tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table(iwqp->iwdev,
- &tcp_info->dest_ip_addr3,
- true,
- NULL,
- I40IW_ARP_RESOLVE));
+ tcp_info->arp_idx =
+ cpu_to_le16((u16)i40iw_arp_table(
+ iwqp->iwdev,
+ &tcp_info->dest_ip_addr3,
+ true,
+ NULL,
+ I40IW_ARP_RESOLVE));
} else {
tcp_info->src_port = cpu_to_le16(cm_node->loc_port);
tcp_info->dst_port = cpu_to_le16(cm_node->rem_port);
@@ -3282,12 +3260,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node,
tcp_info->local_ipaddr1 = cpu_to_le32(cm_node->loc_addr[1]);
tcp_info->local_ipaddr2 = cpu_to_le32(cm_node->loc_addr[2]);
tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[3]);
- tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table(
- iwqp->iwdev,
- &tcp_info->dest_ip_addr0,
- false,
- NULL,
- I40IW_ARP_RESOLVE));
+ tcp_info->arp_idx =
+ cpu_to_le16((u16)i40iw_arp_table(
+ iwqp->iwdev,
+ &tcp_info->dest_ip_addr0,
+ false,
+ NULL,
+ I40IW_ARP_RESOLVE));
}
}
@@ -3368,26 +3347,6 @@ int i40iw_cm_disconn(struct i40iw_qp *iwqp)
}
/**
- * i40iw_loopback_nop - Send a nop
- * @qp: associated hw qp
- */
-static void i40iw_loopback_nop(struct i40iw_sc_qp *qp)
-{
- u64 *wqe;
- u64 header;
-
- wqe = qp->qp_uk.sq_base->elem;
- set_64bit_val(wqe, 0, 0);
- set_64bit_val(wqe, 8, 0);
- set_64bit_val(wqe, 16, 0);
-
- header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) |
- LS_64(0, I40IWQPSQ_SIGCOMPL) |
- LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
- set_64bit_val(wqe, 24, header);
-}
-
-/**
* i40iw_qp_disconnect - free qp and close cm
* @iwqp: associate qp for the connection
*/
@@ -3564,7 +3523,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
struct i40iw_cm_node *cm_node;
struct ib_qp_attr attr;
int passive_state;
- struct i40iw_ib_device *iwibdev;
struct ib_mr *ibmr;
struct i40iw_pd *iwpd;
u16 buf_len = 0;
@@ -3627,7 +3585,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
!i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) ||
(!cm_node->ipv4 &&
!i40iw_ipv6_is_loopback(cm_node->loc_addr, cm_node->rem_addr))) {
- iwibdev = iwdev->iwibdev;
iwpd = iwqp->iwpd;
tagged_offset = (uintptr_t)iwqp->ietf_mem.va;
ibmr = i40iw_reg_phys_mr(&iwpd->ibpd,
@@ -3661,7 +3618,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
} else {
if (iwqp->page)
iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
- i40iw_loopback_nop(&iwqp->sc_qp);
+ dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp, NULL, 0, 0);
}
if (iwqp->page)
@@ -3752,6 +3709,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
struct sockaddr_in *raddr;
struct sockaddr_in6 *laddr6;
struct sockaddr_in6 *raddr6;
+ bool qhash_set = false;
int apbvt_set = 0;
enum i40iw_status_code status;
@@ -3810,6 +3768,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
true);
if (status)
return -EINVAL;
+ qhash_set = true;
}
status = i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD);
if (status) {
@@ -3828,23 +3787,8 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
conn_param->private_data_len,
(void *)conn_param->private_data,
&cm_info);
- if (!cm_node) {
- i40iw_manage_qhash(iwdev,
- &cm_info,
- I40IW_QHASH_TYPE_TCP_ESTABLISHED,
- I40IW_QHASH_MANAGE_TYPE_DELETE,
- NULL,
- false);
-
- if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core,
- cm_info.loc_port))
- i40iw_manage_apbvt(iwdev,
- cm_info.loc_port,
- I40IW_MANAGE_APBVT_DEL);
- cm_id->rem_ref(cm_id);
- iwdev->cm_core.stats_connect_errs++;
- return -ENOMEM;
- }
+ if (!cm_node)
+ goto err;
i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
@@ -3852,12 +3796,54 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
cm_node->ord_size = 1;
cm_node->apbvt_set = apbvt_set;
- cm_node->qhash_set = true;
+ cm_node->qhash_set = qhash_set;
iwqp->cm_node = cm_node;
cm_node->iwqp = iwqp;
iwqp->cm_id = cm_id;
i40iw_add_ref(&iwqp->ibqp);
+
+ if (cm_node->state == I40IW_CM_STATE_SYN_SENT) {
+ if (i40iw_send_syn(cm_node, 0)) {
+ i40iw_rem_ref_cm_node(cm_node);
+ goto err;
+ }
+ }
+
+ i40iw_debug(cm_node->dev,
+ I40IW_DEBUG_CM,
+ "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n",
+ cm_node->rem_port,
+ cm_node,
+ cm_node->cm_id);
return 0;
+
+err:
+ if (cm_node) {
+ if (cm_node->ipv4)
+ i40iw_debug(cm_node->dev,
+ I40IW_DEBUG_CM,
+ "Api - connect() FAILED: dest addr=%pI4",
+ cm_node->rem_addr);
+ else
+ i40iw_debug(cm_node->dev, I40IW_DEBUG_CM,
+ "Api - connect() FAILED: dest addr=%pI6",
+ cm_node->rem_addr);
+ }
+ i40iw_manage_qhash(iwdev,
+ &cm_info,
+ I40IW_QHASH_TYPE_TCP_ESTABLISHED,
+ I40IW_QHASH_MANAGE_TYPE_DELETE,
+ NULL,
+ false);
+
+ if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core,
+ cm_info.loc_port))
+ i40iw_manage_apbvt(iwdev,
+ cm_info.loc_port,
+ I40IW_MANAGE_APBVT_DEL);
+ cm_id->rem_ref(cm_id);
+ iwdev->cm_core.stats_connect_errs++;
+ return -ENOMEM;
}
/**
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h
index 5f8ceb4a8e84..e9046d9f9645 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h
@@ -1,6 +1,6 @@
/*******************************************************************************
*
-* Copyright (c) 2015 Intel Corporation. All rights reserved.
+* Copyright (c) 2015-2016 Intel Corporation. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -291,8 +291,6 @@ struct i40iw_cm_listener {
u8 loc_mac[ETH_ALEN];
u32 loc_addr[4];
u16 loc_port;
- u32 map_loc_addr[4];
- u16 map_loc_port;
struct iw_cm_id *cm_id;
atomic_t ref_count;
struct i40iw_device *iwdev;
@@ -317,8 +315,6 @@ struct i40iw_kmem_info {
struct i40iw_cm_node {
u32 loc_addr[4], rem_addr[4];
u16 loc_port, rem_port;
- u32 map_loc_addr[4], map_rem_addr[4];
- u16 map_loc_port, map_rem_port;
u16 vlan_id;
enum i40iw_cm_node_state state;
u8 loc_mac[ETH_ALEN];
@@ -370,10 +366,6 @@ struct i40iw_cm_info {
u16 rem_port;
u32 loc_addr[4];
u32 rem_addr[4];
- u16 map_loc_port;
- u16 map_rem_port;
- u32 map_loc_addr[4];
- u32 map_rem_addr[4];
u16 vlan_id;
int backlog;
u16 user_pri;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
index f05802bf6ca0..2c4b4d072d6a 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
@@ -114,16 +114,21 @@ static enum i40iw_status_code i40iw_cqp_poll_registers(
* i40iw_sc_parse_fpm_commit_buf - parse fpm commit buffer
* @buf: ptr to fpm commit buffer
* @info: ptr to i40iw_hmc_obj_info struct
+ * @sd: number of SDs for HMC objects
*
* parses fpm commit info and copy base value
* of hmc objects in hmc_info
*/
static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf(
u64 *buf,
- struct i40iw_hmc_obj_info *info)
+ struct i40iw_hmc_obj_info *info,
+ u32 *sd)
{
u64 temp;
+ u64 size;
+ u64 base = 0;
u32 i, j;
+ u32 k = 0;
u32 low;
/* copy base values in obj_info */
@@ -131,10 +136,20 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf(
i <= I40IW_HMC_IW_PBLE; i++, j += 8) {
get_64bit_val(buf, j, &temp);
info[i].base = RS_64_1(temp, 32) * 512;
+ if (info[i].base > base) {
+ base = info[i].base;
+ k = i;
+ }
low = (u32)(temp);
if (low)
info[i].cnt = low;
}
+ size = info[k].cnt * info[k].size + info[k].base;
+ if (size & 0x1FFFFF)
+ *sd = (u32)((size >> 21) + 1); /* add 1 for remainder */
+ else
+ *sd = (u32)(size >> 21);
+
return 0;
}
@@ -2909,6 +2924,65 @@ static enum i40iw_status_code i40iw_sc_mw_alloc(
}
/**
+ * i40iw_sc_mr_fast_register - Posts RDMA fast register mr WR to iwarp qp
+ * @qp: sc qp struct
+ * @info: fast mr info
+ * @post_sq: flag for cqp db to ring
+ */
+enum i40iw_status_code i40iw_sc_mr_fast_register(
+ struct i40iw_sc_qp *qp,
+ struct i40iw_fast_reg_stag_info *info,
+ bool post_sq)
+{
+ u64 temp, header;
+ u64 *wqe;
+ u32 wqe_idx;
+
+ wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE,
+ 0, info->wr_id);
+ if (!wqe)
+ return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
+
+ i40iw_debug(qp->dev, I40IW_DEBUG_MR, "%s: wr_id[%llxh] wqe_idx[%04d] location[%p]\n",
+ __func__, info->wr_id, wqe_idx,
+ &qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid);
+ temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo;
+ set_64bit_val(wqe, 0, temp);
+
+ temp = RS_64(info->first_pm_pbl_index >> 16, I40IWQPSQ_FIRSTPMPBLIDXHI);
+ set_64bit_val(wqe,
+ 8,
+ LS_64(temp, I40IWQPSQ_FIRSTPMPBLIDXHI) |
+ LS_64(info->reg_addr_pa >> I40IWQPSQ_PBLADDR_SHIFT, I40IWQPSQ_PBLADDR));
+
+ set_64bit_val(wqe,
+ 16,
+ info->total_len |
+ LS_64(info->first_pm_pbl_index, I40IWQPSQ_FIRSTPMPBLIDXLO));
+
+ header = LS_64(info->stag_key, I40IWQPSQ_STAGKEY) |
+ LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) |
+ LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) |
+ LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) |
+ LS_64(info->page_size, I40IWQPSQ_HPAGESIZE) |
+ LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) |
+ LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) |
+ LS_64(info->read_fence, I40IWQPSQ_READFENCE) |
+ LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) |
+ LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) |
+ LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID);
+
+ i40iw_insert_wqe_hdr(wqe, header);
+
+ i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "FAST_REG WQE",
+ wqe, I40IW_QP_WQE_MIN_SIZE);
+
+ if (post_sq)
+ i40iw_qp_post_wr(&qp->qp_uk);
+ return 0;
+}
+
+/**
* i40iw_sc_send_lsmm - send last streaming mode message
* @qp: sc qp struct
* @lsmm_buf: buffer with lsmm message
@@ -3147,7 +3221,7 @@ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_
i40iw_cqp_commit_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id);
/* parse the fpm_commit_buf and fill hmc obj info */
- i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj);
+ i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj, &hmc_info->sd_table.sd_cnt);
mem_size = sizeof(struct i40iw_hmc_sd_entry) *
(hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index);
ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size);
@@ -3221,7 +3295,9 @@ static enum i40iw_status_code i40iw_sc_configure_iw_fpm(struct i40iw_sc_dev *dev
/* parse the fpm_commit_buf and fill hmc obj info */
if (!ret_code)
- ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, hmc_info->hmc_obj);
+ ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf,
+ hmc_info->hmc_obj,
+ &hmc_info->sd_table.sd_cnt);
i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "COMMIT FPM BUFFER",
commit_fpm_mem.va, I40IW_COMMIT_FPM_BUF_SIZE);
@@ -3469,6 +3545,40 @@ static bool i40iw_ring_full(struct i40iw_sc_cqp *cqp)
}
/**
+ * i40iw_est_sd - returns approximate number of SDs for HMC
+ * @dev: sc device struct
+ * @hmc_info: hmc structure, size and count for HMC objects
+ */
+static u64 i40iw_est_sd(struct i40iw_sc_dev *dev, struct i40iw_hmc_info *hmc_info)
+{
+ int i;
+ u64 size = 0;
+ u64 sd;
+
+ for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_PBLE; i++)
+ size += hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size;
+
+ if (dev->is_pf)
+ size += hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size;
+
+ if (size & 0x1FFFFF)
+ sd = (size >> 21) + 1; /* add 1 for remainder */
+ else
+ sd = size >> 21;
+
+ if (!dev->is_pf) {
+ /* 2MB alignment for VF PBLE HMC */
+ size = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size;
+ if (size & 0x1FFFFF)
+ sd += (size >> 21) + 1; /* add 1 for remainder */
+ else
+ sd += size >> 21;
+ }
+
+ return sd;
+}
+
+/**
* i40iw_config_fpm_values - configure HMC objects
* @dev: sc device struct
* @qp_count: desired qp count
@@ -3479,7 +3589,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
u32 i, mem_size;
u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted;
u32 powerof2;
- u64 sd_needed, bytes_needed;
+ u64 sd_needed;
u32 loop_count = 0;
struct i40iw_hmc_info *hmc_info;
@@ -3497,23 +3607,15 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
return ret_code;
}
- bytes_needed = 0;
- for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) {
+ for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++)
hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt;
- bytes_needed +=
- (hmc_info->hmc_obj[i].max_cnt) * (hmc_info->hmc_obj[i].size);
- i40iw_debug(dev, I40IW_DEBUG_HMC,
- "%s i[%04d] max_cnt[0x%04X] size[0x%04llx]\n",
- __func__, i, hmc_info->hmc_obj[i].max_cnt,
- hmc_info->hmc_obj[i].size);
- }
- sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up */
+ sd_needed = i40iw_est_sd(dev, hmc_info);
i40iw_debug(dev, I40IW_DEBUG_HMC,
"%s: FW initial max sd_count[%08lld] first_sd_index[%04d]\n",
__func__, sd_needed, hmc_info->first_sd_index);
i40iw_debug(dev, I40IW_DEBUG_HMC,
- "%s: bytes_needed=0x%llx sd count %d where max sd is %d\n",
- __func__, bytes_needed, hmc_info->sd_table.sd_cnt,
+ "%s: sd count %d where max sd is %d\n",
+ __func__, hmc_info->sd_table.sd_cnt,
hmc_fpm_misc->max_sds);
qpwanted = min(qp_count, hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt);
@@ -3555,11 +3657,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt = pblewanted;
/* How much memory is needed for all the objects. */
- bytes_needed = 0;
- for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++)
- bytes_needed +=
- (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size);
- sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1;
+ sd_needed = i40iw_est_sd(dev, hmc_info);
if ((loop_count > 1000) ||
((!(loop_count % 10)) &&
(qpwanted > qpwantedoriginal * 2 / 3))) {
@@ -3580,15 +3678,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
pblewanted -= FPM_MULTIPLIER * 1000;
} while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000);
- bytes_needed = 0;
- for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) {
- bytes_needed += (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size);
- i40iw_debug(dev, I40IW_DEBUG_HMC,
- "%s i[%04d] cnt[0x%04x] size[0x%04llx]\n",
- __func__, i, hmc_info->hmc_obj[i].cnt,
- hmc_info->hmc_obj[i].size);
- }
- sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up not truncate. */
+ sd_needed = i40iw_est_sd(dev, hmc_info);
i40iw_debug(dev, I40IW_DEBUG_HMC,
"loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n",
@@ -3606,8 +3696,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_
return ret_code;
}
- hmc_info->sd_table.sd_cnt = (u32)sd_needed;
-
mem_size = sizeof(struct i40iw_hmc_sd_entry) *
(hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1);
ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size);
@@ -3911,11 +3999,11 @@ enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev)
*/
static u32 i40iw_iwarp_opcode(struct i40iw_aeqe_info *info, u8 *pkt)
{
- u16 *mpa;
+ __be16 *mpa;
u32 opcode = 0xffffffff;
if (info->q2_data_written) {
- mpa = (u16 *)pkt;
+ mpa = (__be16 *)pkt;
opcode = ntohs(mpa[1]) & 0xf;
}
return opcode;
@@ -3977,7 +4065,7 @@ static int i40iw_bld_terminate_hdr(struct i40iw_sc_qp *qp,
if (info->q2_data_written) {
/* Use data from offending packet to fill in ddp & rdma hdrs */
pkt = i40iw_locate_mpa(pkt);
- ddp_seg_len = ntohs(*(u16 *)pkt);
+ ddp_seg_len = ntohs(*(__be16 *)pkt);
if (ddp_seg_len) {
copy_len = 2;
termhdr->hdrct = DDP_LEN_FLAG;
@@ -4188,13 +4276,13 @@ void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *
void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info)
{
u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET;
- u32 *mpa;
+ __be32 *mpa;
u8 ddp_ctl;
u8 rdma_ctl;
u16 aeq_id = 0;
struct i40iw_terminate_hdr *termhdr;
- mpa = (u32 *)i40iw_locate_mpa(pkt);
+ mpa = (__be32 *)i40iw_locate_mpa(pkt);
if (info->q2_data_written) {
/* did not validate the frame - do it now */
ddp_ctl = (ntohl(mpa[0]) >> 8) & 0xff;
@@ -4559,17 +4647,18 @@ static struct i40iw_pd_ops iw_pd_ops = {
};
static struct i40iw_priv_qp_ops iw_priv_qp_ops = {
- i40iw_sc_qp_init,
- i40iw_sc_qp_create,
- i40iw_sc_qp_modify,
- i40iw_sc_qp_destroy,
- i40iw_sc_qp_flush_wqes,
- i40iw_sc_qp_upload_context,
- i40iw_sc_qp_setctx,
- i40iw_sc_send_lsmm,
- i40iw_sc_send_lsmm_nostag,
- i40iw_sc_send_rtt,
- i40iw_sc_post_wqe0,
+ .qp_init = i40iw_sc_qp_init,
+ .qp_create = i40iw_sc_qp_create,
+ .qp_modify = i40iw_sc_qp_modify,
+ .qp_destroy = i40iw_sc_qp_destroy,
+ .qp_flush_wqes = i40iw_sc_qp_flush_wqes,
+ .qp_upload_context = i40iw_sc_qp_upload_context,
+ .qp_setctx = i40iw_sc_qp_setctx,
+ .qp_send_lsmm = i40iw_sc_send_lsmm,
+ .qp_send_lsmm_nostag = i40iw_sc_send_lsmm_nostag,
+ .qp_send_rtt = i40iw_sc_send_rtt,
+ .qp_post_wqe0 = i40iw_sc_post_wqe0,
+ .iw_mr_fast_register = i40iw_sc_mr_fast_register
};
static struct i40iw_priv_cq_ops iw_priv_cq_ops = {
diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h
index aab88d65f805..2fac1db0e0a0 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_d.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_d.h
@@ -1290,7 +1290,7 @@
/* wqe size considering 32 bytes per wqe*/
#define I40IWQP_SW_MIN_WQSIZE 4 /* 128 bytes */
-#define I40IWQP_SW_MAX_WQSIZE 16384 /* 524288 bytes */
+#define I40IWQP_SW_MAX_WQSIZE 2048 /* 2048 bytes */
#define I40IWQP_OP_RDMA_WRITE 0
#define I40IWQP_OP_RDMA_READ 1
@@ -1512,6 +1512,8 @@ enum i40iw_alignment {
I40IW_SD_BUF_ALIGNMENT = 0x100
};
+#define I40IW_WQE_SIZE_64 64
+
#define I40IW_QP_WQE_MIN_SIZE 32
#define I40IW_QP_WQE_MAX_SIZE 128
@@ -1555,6 +1557,9 @@ enum i40iw_alignment {
#define I40IW_RING_MOVE_TAIL(_ring) \
(_ring).tail = ((_ring).tail + 1) % (_ring).size
+#define I40IW_RING_MOVE_HEAD_NOCHECK(_ring) \
+ (_ring).head = ((_ring).head + 1) % (_ring).size
+
#define I40IW_RING_MOVE_TAIL_BY_COUNT(_ring, _count) \
(_ring).tail = ((_ring).tail + (_count)) % (_ring).size
diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c
index 9fd302425563..0c92a40b3e86 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_hw.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c
@@ -106,7 +106,9 @@ u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev)
set_bit(2, iwdev->allocated_pds);
spin_lock_init(&iwdev->resource_lock);
- mrdrvbits = 24 - get_count_order(iwdev->max_mr);
+ spin_lock_init(&iwdev->qptable_lock);
+ /* stag index mask has a minimum of 14 bits */
+ mrdrvbits = 24 - max(get_count_order(iwdev->max_mr), 14);
iwdev->mr_stagmask = ~(((1 << mrdrvbits) - 1) << (32 - mrdrvbits));
return 0;
}
@@ -263,6 +265,7 @@ void i40iw_next_iw_state(struct i40iw_qp *iwqp,
info.dont_send_fin = false;
if (iwqp->sc_qp.term_flags && (state == I40IW_QP_STATE_ERROR))
info.reset_tcp_conn = true;
+ iwqp->hw_iwarp_state = state;
i40iw_hw_modify_qp(iwqp->iwdev, iwqp, &info, 0);
}
@@ -301,11 +304,15 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
"%s ae_id = 0x%x bool qp=%d qp_id = %d\n",
__func__, info->ae_id, info->qp, info->qp_cq_id);
if (info->qp) {
+ spin_lock_irqsave(&iwdev->qptable_lock, flags);
iwqp = iwdev->qp_table[info->qp_cq_id];
if (!iwqp) {
+ spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
i40iw_pr_err("qp_id %d is already freed\n", info->qp_cq_id);
continue;
}
+ i40iw_add_ref(&iwqp->ibqp);
+ spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
qp = &iwqp->sc_qp;
spin_lock_irqsave(&iwqp->lock, flags);
iwqp->hw_tcp_state = info->tcp_state;
@@ -411,6 +418,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev)
i40iw_terminate_connection(qp, info);
break;
}
+ if (info->qp)
+ i40iw_rem_ref(&iwqp->ibqp);
} while (1);
if (aeqcnt)
@@ -460,7 +469,7 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad
*/
void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
unsigned char *mac_addr,
- __be32 *ip_addr,
+ u32 *ip_addr,
bool ipv4,
u32 action)
{
@@ -481,7 +490,7 @@ void i40iw_manage_arp_cache(struct i40iw_device *iwdev,
cqp_info->cqp_cmd = OP_ADD_ARP_CACHE_ENTRY;
info = &cqp_info->in.u.add_arp_cache_entry.info;
memset(info, 0, sizeof(*info));
- info->arp_index = cpu_to_le32(arp_index);
+ info->arp_index = cpu_to_le16((u16)arp_index);
info->permanent = true;
ether_addr_copy(info->mac_addr, mac_addr);
cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 90e5af21737e..445e230d5ff8 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -100,7 +100,7 @@ static struct notifier_block i40iw_net_notifier = {
.notifier_call = i40iw_net_event
};
-static int i40iw_notifiers_registered;
+static atomic_t i40iw_notifiers_registered;
/**
* i40iw_find_i40e_handler - find a handler given a client info
@@ -270,7 +270,6 @@ static void i40iw_disable_irq(struct i40iw_sc_dev *dev,
i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0);
else
i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0);
- synchronize_irq(msix_vec->irq);
free_irq(msix_vec->irq, dev_id);
}
@@ -601,8 +600,7 @@ static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev)
cqp_init_info.scratch_array = cqp->scratch_array;
status = dev->cqp_ops->cqp_init(dev->cqp, &cqp_init_info);
if (status) {
- i40iw_pr_err("cqp init status %d maj_err %d min_err %d\n",
- status, maj_err, min_err);
+ i40iw_pr_err("cqp init status %d\n", status);
goto exit;
}
status = dev->cqp_ops->cqp_create(dev->cqp, true, &maj_err, &min_err);
@@ -1147,10 +1145,7 @@ static enum i40iw_status_code i40iw_alloc_set_mac_ipaddr(struct i40iw_device *iw
if (!status) {
status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr,
(u8)iwdev->mac_ip_table_idx);
- if (!status)
- status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr,
- (u8)iwdev->mac_ip_table_idx);
- else
+ if (status)
i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx);
}
return status;
@@ -1165,7 +1160,7 @@ static void i40iw_add_ipv6_addr(struct i40iw_device *iwdev)
struct net_device *ip_dev;
struct inet6_dev *idev;
struct inet6_ifaddr *ifp;
- __be32 local_ipaddr6[4];
+ u32 local_ipaddr6[4];
rcu_read_lock();
for_each_netdev_rcu(&init_net, ip_dev) {
@@ -1347,12 +1342,11 @@ exit:
*/
static void i40iw_register_notifiers(void)
{
- if (!i40iw_notifiers_registered) {
+ if (atomic_inc_return(&i40iw_notifiers_registered) == 1) {
register_inetaddr_notifier(&i40iw_inetaddr_notifier);
register_inet6addr_notifier(&i40iw_inetaddr6_notifier);
register_netevent_notifier(&i40iw_net_notifier);
}
- i40iw_notifiers_registered++;
}
/**
@@ -1434,8 +1428,7 @@ static void i40iw_deinit_device(struct i40iw_device *iwdev, bool reset, bool del
i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx);
/* fallthrough */
case INET_NOTIFIER:
- if (i40iw_notifiers_registered > 0) {
- i40iw_notifiers_registered--;
+ if (!atomic_dec_return(&i40iw_notifiers_registered)) {
unregister_netevent_notifier(&i40iw_net_notifier);
unregister_inetaddr_notifier(&i40iw_inetaddr_notifier);
unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier);
@@ -1512,6 +1505,7 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl,
I40IW_HMC_PROFILE_DEFAULT;
iwdev->max_rdma_vfs =
(iwdev->resource_profile != I40IW_HMC_PROFILE_DEFAULT) ? max_rdma_vfs : 0;
+ iwdev->max_enabled_vfs = iwdev->max_rdma_vfs;
iwdev->netdev = ldev->netdev;
hdl->client = client;
iwdev->mss = (!ldev->params.mtu) ? I40IW_DEFAULT_MSS : ldev->params.mtu - I40IW_MTU_TO_MSS;
@@ -1531,7 +1525,10 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl,
goto exit;
iwdev->obj_next = iwdev->obj_mem;
iwdev->push_mode = push_mode;
+
init_waitqueue_head(&iwdev->vchnl_waitq);
+ init_waitqueue_head(&dev->vf_reqs);
+
status = i40iw_initialize_dev(iwdev, ldev);
exit:
if (status) {
@@ -1559,6 +1556,10 @@ static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client)
enum i40iw_status_code status;
struct i40iw_handler *hdl;
+ hdl = i40iw_find_netdev(ldev->netdev);
+ if (hdl)
+ return 0;
+
hdl = kzalloc(sizeof(*hdl), GFP_KERNEL);
if (!hdl)
return -ENOMEM;
@@ -1710,7 +1711,6 @@ static void i40iw_vf_reset(struct i40e_info *ldev, struct i40e_client *client, u
for (i = 0; i < I40IW_MAX_PE_ENABLED_VF_COUNT; i++) {
if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id != vf_id))
continue;
-
/* free all resources allocated on behalf of vf */
tmp_vfdev = dev->vf_dev[i];
spin_lock_irqsave(&dev->dev_pestat.stats_lock, flags);
@@ -1819,8 +1819,6 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev,
dev = &hdl->device.sc_dev;
iwdev = dev->back_dev;
- i40iw_debug(dev, I40IW_DEBUG_VIRT, "msg %p, message length %u\n", msg, len);
-
if (dev->vchnl_if.vchnl_recv) {
ret_code = dev->vchnl_if.vchnl_recv(dev, vf_id, msg, len);
if (!dev->is_pf) {
@@ -1832,6 +1830,39 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev,
}
/**
+ * i40iw_vf_clear_to_send - wait to send virtual channel message
+ * @dev: iwarp device *
+ * Wait for until virtual channel is clear
+ * before sending the next message
+ *
+ * Returns false if error
+ * Returns true if clear to send
+ */
+bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev)
+{
+ struct i40iw_device *iwdev;
+ wait_queue_t wait;
+
+ iwdev = dev->back_dev;
+
+ if (!wq_has_sleeper(&dev->vf_reqs) &&
+ (atomic_read(&iwdev->vchnl_msgs) == 0))
+ return true; /* virtual channel is clear */
+
+ init_wait(&wait);
+ add_wait_queue_exclusive(&dev->vf_reqs, &wait);
+
+ if (!wait_event_timeout(dev->vf_reqs,
+ (atomic_read(&iwdev->vchnl_msgs) == 0),
+ I40IW_VCHNL_EVENT_TIMEOUT))
+ dev->vchnl_up = false;
+
+ remove_wait_queue(&dev->vf_reqs, &wait);
+
+ return dev->vchnl_up;
+}
+
+/**
* i40iw_virtchnl_send - send a message through the virtual channel
* @dev: iwarp device
* @vf_id: virtual function id associated with the message
@@ -1848,22 +1879,20 @@ static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev,
{
struct i40iw_device *iwdev;
struct i40e_info *ldev;
- enum i40iw_status_code ret_code = I40IW_ERR_BAD_PTR;
if (!dev || !dev->back_dev)
- return ret_code;
+ return I40IW_ERR_BAD_PTR;
iwdev = dev->back_dev;
ldev = iwdev->ldev;
if (ldev && ldev->ops && ldev->ops->virtchnl_send)
- ret_code = ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len);
-
- return ret_code;
+ return ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len);
+ return I40IW_ERR_BAD_PTR;
}
/* client interface functions */
-static struct i40e_client_ops i40e_ops = {
+static const struct i40e_client_ops i40e_ops = {
.open = i40iw_open,
.close = i40iw_close,
.l2_param_change = i40iw_l2param_change,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h
index 7e20493510e8..80f422bf3967 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_osdep.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_osdep.h
@@ -172,6 +172,7 @@ struct i40iw_hw;
u8 __iomem *i40iw_get_hw_addr(void *dev);
void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev);
+bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev);
enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, void *addr,
u32 length, u32 value);
struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, struct i40iw_puda_buf *buf);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c
index ded853d2fad8..85993dc44f6e 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_pble.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c
@@ -404,13 +404,14 @@ static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa;
if (sd_entry->valid)
return 0;
- if (dev->is_pf)
+ if (dev->is_pf) {
ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id,
sd_reg_val, idx->sd_idx,
sd_entry->entry_type, true);
- if (ret_code) {
- i40iw_pr_err("cqp cmd failed for sd (pbles)\n");
- goto error;
+ if (ret_code) {
+ i40iw_pr_err("cqp cmd failed for sd (pbles)\n");
+ goto error;
+ }
}
sd_entry->valid = true;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c
index 8eb400d8a7a0..c62d354f7810 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c
@@ -1025,6 +1025,8 @@ static void i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
u16 txoffset, bufoffset;
buf = i40iw_puda_get_listbuf(pbufl);
+ if (!buf)
+ return;
nextseqnum = buf->seqnum + fpdu_len;
txbuf->totallen = buf->hdrlen + fpdu_len;
txbuf->data = (u8 *)txbuf->mem.va + buf->hdrlen;
@@ -1048,6 +1050,8 @@ static void i40iw_ieq_compl_pfpdu(struct i40iw_puda_rsrc *ieq,
fpdu_len -= buf->datalen;
i40iw_puda_ret_bufpool(ieq, buf);
buf = i40iw_puda_get_listbuf(pbufl);
+ if (!buf)
+ return;
bufoffset = (u16)(buf->data - (u8 *)buf->mem.va);
} while (1);
@@ -1194,7 +1198,7 @@ static enum i40iw_status_code i40iw_ieq_process_buf(struct i40iw_puda_rsrc *ieq,
ioffset = (u16)(buf->data - (u8 *)buf->mem.va);
while (datalen) {
- fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(u16 *)datap));
+ fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(__be16 *)datap));
if (fpdu_len > pfpdu->max_fpdu_data) {
i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ,
"%s: error bad fpdu_len\n", __func__);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h
index b0110c15e044..91c421762f06 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_status.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_status.h
@@ -95,6 +95,7 @@ enum i40iw_status_code {
I40IW_ERR_INVALID_MAC_ADDR = -65,
I40IW_ERR_BAD_STAG = -66,
I40IW_ERR_CQ_COMPL_ERROR = -67,
+ I40IW_ERR_QUEUE_DESTROYED = -68
};
#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h
index edb3a8c8267a..2b1a04e9ca3c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_type.h
@@ -479,16 +479,17 @@ struct i40iw_sc_dev {
struct i40iw_virt_mem ieq_mem;
struct i40iw_puda_rsrc *ieq;
- struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
+ const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
struct i40iw_hmc_fpm_misc hmc_fpm_misc;
u16 qs_handle;
- u32 debug_mask;
+ u32 debug_mask;
u16 exception_lan_queue;
u8 hmc_fn_id;
bool is_pf;
bool vchnl_up;
u8 vf_id;
+ wait_queue_head_t vf_reqs;
u64 cqp_cmd_stats[OP_SIZE_CQP_STAT_ARRAY];
struct i40iw_vchnl_vf_msg_buffer vchnl_vf_msg_buf;
u8 hw_rev;
@@ -666,7 +667,7 @@ struct i40iw_tcp_offload_info {
bool time_stamp;
u8 cwnd_inc_limit;
bool drop_ooo_seg;
- bool dup_ack_thresh;
+ u8 dup_ack_thresh;
u8 ttl;
u8 src_mac_addr_idx;
bool avoid_stretch_ack;
@@ -889,8 +890,8 @@ struct i40iw_qhash_table_info {
u32 qp_num;
u32 dest_ip[4];
u32 src_ip[4];
- u32 dest_port;
- u32 src_port;
+ u16 dest_port;
+ u16 src_port;
};
struct i40iw_local_mac_ipaddr_entry_info {
@@ -1040,6 +1041,9 @@ struct i40iw_priv_qp_ops {
void (*qp_send_lsmm_nostag)(struct i40iw_sc_qp *, void *, u32);
void (*qp_send_rtt)(struct i40iw_sc_qp *, bool);
enum i40iw_status_code (*qp_post_wqe0)(struct i40iw_sc_qp *, u8);
+ enum i40iw_status_code (*iw_mr_fast_register)(struct i40iw_sc_qp *,
+ struct i40iw_fast_reg_stag_info *,
+ bool);
};
struct i40iw_priv_cq_ops {
@@ -1108,7 +1112,7 @@ struct i40iw_hmc_ops {
enum i40iw_status_code (*parse_fpm_query_buf)(u64 *, struct i40iw_hmc_info *,
struct i40iw_hmc_fpm_misc *);
enum i40iw_status_code (*configure_iw_fpm)(struct i40iw_sc_dev *, u8);
- enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *);
+ enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *, u32 *sd);
enum i40iw_status_code (*create_hmc_object)(struct i40iw_sc_dev *dev,
struct i40iw_hmc_create_obj_info *);
enum i40iw_status_code (*del_hmc_object)(struct i40iw_sc_dev *dev,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c
index f78c3dc8bdb2..4d28c3cb03cc 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_uk.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c
@@ -56,6 +56,9 @@ static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp)
wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
wqe = qp->sq_base[wqe_idx].elem;
+
+ qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE;
+
peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size;
wqe_0 = qp->sq_base[peek_head].elem;
if (peek_head)
@@ -130,7 +133,10 @@ static void i40iw_qp_ring_push_db(struct i40iw_qp_uk *qp, u32 wqe_idx)
*/
u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
u32 *wqe_idx,
- u8 wqe_size)
+ u8 wqe_size,
+ u32 total_size,
+ u64 wr_id
+ )
{
u64 *wqe = NULL;
u64 wqe_ptr;
@@ -159,6 +165,17 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
if (!*wqe_idx)
qp->swqe_polarity = !qp->swqe_polarity;
}
+
+ if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) {
+ i40iw_nop_1(qp);
+ I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
+ if (ret_code)
+ return NULL;
+ *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
+ if (!*wqe_idx)
+ qp->swqe_polarity = !qp->swqe_polarity;
+ }
+
for (i = 0; i < wqe_size / I40IW_QP_WQE_MIN_SIZE; i++) {
I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code);
if (ret_code)
@@ -169,8 +186,15 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp,
peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
wqe_0 = qp->sq_base[peek_head].elem;
- if (peek_head & 0x3)
- wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
+
+ if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) {
+ if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity)
+ wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
+ }
+
+ qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id;
+ qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size;
+ qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size;
return wqe;
}
@@ -249,12 +273,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
if (ret_code)
return ret_code;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = total_size;
set_64bit_val(wqe, 16,
LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
if (!op_info->rem_addr.stag)
@@ -270,9 +291,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp,
i40iw_set_fragment(wqe, 0, op_info->lo_sg_list);
- for (i = 1; i < op_info->num_lo_sges; i++) {
- byte_off = 32 + (i - 1) * 16;
+ for (i = 1, byte_off = 32; i < op_info->num_lo_sges; i++) {
i40iw_set_fragment(wqe, byte_off, &op_info->lo_sg_list[i]);
+ byte_off += 16;
}
wmb(); /* make sure WQE is populated before valid bit is set */
@@ -309,12 +330,9 @@ static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp,
ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size);
if (ret_code)
return ret_code;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->lo_addr.len, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->lo_addr.len;
local_fence |= info->local_fence;
set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
@@ -366,13 +384,11 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
if (ret_code)
return ret_code;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
read_fence |= info->read_fence;
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = total_size;
set_64bit_val(wqe, 16, 0);
header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) |
LS_64(info->op_type, I40IWQPSQ_OPCODE) |
@@ -385,9 +401,9 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp,
i40iw_set_fragment(wqe, 0, op_info->sg_list);
- for (i = 1; i < op_info->num_sges; i++) {
- byte_off = 32 + (i - 1) * 16;
+ for (i = 1, byte_off = 32; i < op_info->num_sges; i++) {
i40iw_set_fragment(wqe, byte_off, &op_info->sg_list[i]);
+ byte_off += 16;
}
wmb(); /* make sure WQE is populated before valid bit is set */
@@ -427,13 +443,11 @@ static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp,
if (ret_code)
return ret_code;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
read_fence |= info->read_fence;
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len;
set_64bit_val(wqe, 16,
LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO));
@@ -507,14 +521,11 @@ static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp,
if (ret_code)
return ret_code;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
read_fence |= info->read_fence;
-
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len;
header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) |
LS_64(info->op_type, I40IWQPSQ_OPCODE) |
LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) |
@@ -574,12 +585,9 @@ static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp
op_info = &info->op.inv_local_stag;
local_fence = info->local_fence;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = 0;
set_64bit_val(wqe, 0, 0);
set_64bit_val(wqe, 8,
LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG));
@@ -619,12 +627,9 @@ static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp,
op_info = &info->op.bind_window;
local_fence |= info->local_fence;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
- qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = 0;
set_64bit_val(wqe, 0, (uintptr_t)op_info->va);
set_64bit_val(wqe, 8,
LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) |
@@ -680,9 +685,9 @@ static enum i40iw_status_code i40iw_post_receive(struct i40iw_qp_uk *qp,
i40iw_set_fragment(wqe, 0, info->sg_list);
- for (i = 1; i < info->num_sges; i++) {
- byte_off = 32 + (i - 1) * 16;
+ for (i = 1, byte_off = 32; i < info->num_sges; i++) {
i40iw_set_fragment(wqe, byte_off, &info->sg_list[i]);
+ byte_off += 16;
}
wmb(); /* make sure WQE is populated before valid bit is set */
@@ -748,8 +753,7 @@ static enum i40iw_status_code i40iw_cq_post_entries(struct i40iw_cq_uk *cq,
* @post_cq: update cq tail
*/
static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
- struct i40iw_cq_poll_info *info,
- bool post_cq)
+ struct i40iw_cq_poll_info *info)
{
u64 comp_ctx, qword0, qword2, qword3, wqe_qword;
u64 *cqe, *sw_wqe;
@@ -757,10 +761,9 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
struct i40iw_ring *pring = NULL;
u32 wqe_idx, q_type, array_idx = 0;
enum i40iw_status_code ret_code = 0;
- enum i40iw_status_code ret_code2 = 0;
bool move_cq_head = true;
u8 polarity;
- u8 addl_frag_cnt, addl_wqes = 0;
+ u8 addl_wqes = 0;
if (cq->avoid_mem_cflct)
cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq);
@@ -797,6 +800,10 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
info->is_srq = (bool)RS_64(qword3, I40IWCQ_SRQ);
qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx;
+ if (!qp) {
+ ret_code = I40IW_ERR_QUEUE_DESTROYED;
+ goto exit;
+ }
wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX);
info->qp_handle = (i40iw_qp_handle)(unsigned long)qp;
@@ -827,11 +834,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
info->op_type = (u8)RS_64(qword3, I40IWCQ_OP);
sw_wqe = qp->sq_base[wqe_idx].elem;
get_64bit_val(sw_wqe, 24, &wqe_qword);
- addl_frag_cnt =
- (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT);
- i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes);
- addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE);
+ addl_wqes = qp->sq_wrtrk_array[wqe_idx].wqe_size / I40IW_QP_WQE_MIN_SIZE;
I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes));
} else {
do {
@@ -843,9 +847,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
get_64bit_val(sw_wqe, 24, &wqe_qword);
op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE);
info->op_type = op_type;
- addl_frag_cnt = (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT);
- i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes);
- addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE);
+ addl_wqes = qp->sq_wrtrk_array[tail].wqe_size / I40IW_QP_WQE_MIN_SIZE;
I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes));
if (op_type != I40IWQP_OP_NOP) {
info->wr_id = qp->sq_wrtrk_array[tail].wrid;
@@ -859,25 +861,21 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
ret_code = 0;
+exit:
if (!ret_code &&
(info->comp_status == I40IW_COMPL_STATUS_FLUSHED))
if (pring && (I40IW_RING_MORE_WORK(*pring)))
move_cq_head = false;
if (move_cq_head) {
- I40IW_RING_MOVE_HEAD(cq->cq_ring, ret_code2);
-
- if (ret_code2 && !ret_code)
- ret_code = ret_code2;
+ I40IW_RING_MOVE_HEAD_NOCHECK(cq->cq_ring);
if (I40IW_RING_GETCURRENT_HEAD(cq->cq_ring) == 0)
cq->polarity ^= 1;
- if (post_cq) {
- I40IW_RING_MOVE_TAIL(cq->cq_ring);
- set_64bit_val(cq->shadow_area, 0,
- I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
- }
+ I40IW_RING_MOVE_TAIL(cq->cq_ring);
+ set_64bit_val(cq->shadow_area, 0,
+ I40IW_RING_GETCURRENT_HEAD(cq->cq_ring));
} else {
if (info->is_srq)
return ret_code;
@@ -893,19 +891,21 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq,
* i40iw_get_wqe_shift - get shift count for maximum wqe size
* @wqdepth: depth of wq required.
* @sge: Maximum Scatter Gather Elements wqe
+ * @inline_data: Maximum inline data size
* @shift: Returns the shift needed based on sge
*
- * Shift can be used to left shift the wqe size based on sge.
- * If sge, == 1, shift =0 (wqe_size of 32 bytes), for sge=2 and 3, shift =1
- * (64 bytes wqes) and 2 otherwise (128 bytes wqe).
+ * Shift can be used to left shift the wqe size based on number of SGEs and inlind data size.
+ * For 1 SGE or inline data <= 16, shift = 0 (wqe size of 32 bytes).
+ * For 2 or 3 SGEs or inline data <= 48, shift = 1 (wqe size of 64 bytes).
+ * Shift of 2 otherwise (wqe size of 128 bytes).
*/
-enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift)
+enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift)
{
u32 size;
*shift = 0;
- if (sge > 1)
- *shift = (sge < 4) ? 1 : 2;
+ if (sge > 1 || inline_data > 16)
+ *shift = (sge < 4 && inline_data <= 48) ? 1 : 2;
/* check if wqdepth is multiple of 2 or not */
@@ -968,11 +968,11 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT)
return I40IW_ERR_INVALID_FRAG_COUNT;
- ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, &sqshift);
+ ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, info->max_inline_data, &sqshift);
if (ret_code)
return ret_code;
- ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, &rqshift);
+ ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, 0, &rqshift);
if (ret_code)
return ret_code;
@@ -1097,12 +1097,9 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
u64 header, *wqe;
u32 wqe_idx;
- wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE);
+ wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, wr_id);
if (!wqe)
return I40IW_ERR_QP_TOOMANY_WRS_POSTED;
-
- qp->sq_wrtrk_array[wqe_idx].wrid = wr_id;
- qp->sq_wrtrk_array[wqe_idx].wr_len = 0;
set_64bit_val(wqe, 0, 0);
set_64bit_val(wqe, 8, 0);
set_64bit_val(wqe, 16, 0);
@@ -1125,7 +1122,7 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp,
* @frag_cnt: number of fragments
* @wqe_size: size of sq wqe returned
*/
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size)
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size)
{
switch (frag_cnt) {
case 0:
@@ -1156,7 +1153,7 @@ enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size)
* @frag_cnt: number of fragments
* @wqe_size: size of rq wqe returned
*/
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size)
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size)
{
switch (frag_cnt) {
case 0:
diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h
index 5cd971bb8cc7..276bcefffd7e 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_user.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_user.h
@@ -61,7 +61,7 @@ enum i40iw_device_capabilities_const {
I40IW_MAX_CQ_SIZE = 1048575,
I40IW_MAX_AEQ_ALLOCATE_COUNT = 255,
I40IW_DB_ID_ZERO = 0,
- I40IW_MAX_WQ_FRAGMENT_COUNT = 6,
+ I40IW_MAX_WQ_FRAGMENT_COUNT = 3,
I40IW_MAX_SGE_RD = 1,
I40IW_MAX_OUTBOUND_MESSAGE_SIZE = 2147483647,
I40IW_MAX_INBOUND_MESSAGE_SIZE = 2147483647,
@@ -70,8 +70,8 @@ enum i40iw_device_capabilities_const {
I40IW_MAX_VF_FPM_ID = 47,
I40IW_MAX_VF_PER_PF = 127,
I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496,
- I40IW_MAX_INLINE_DATA_SIZE = 112,
- I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 112,
+ I40IW_MAX_INLINE_DATA_SIZE = 48,
+ I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48,
I40IW_MAX_IRD_SIZE = 32,
I40IW_QPCTX_ENCD_MAXIRD = 3,
I40IW_MAX_WQ_ENTRIES = 2048,
@@ -102,6 +102,8 @@ enum i40iw_device_capabilities_const {
#define I40IW_STAG_INDEX_FROM_STAG(stag) (((stag) && 0xFFFFFF00) >> 8)
+#define I40IW_MAX_MR_SIZE 0x10000000000L
+
struct i40iw_qp_uk;
struct i40iw_cq_uk;
struct i40iw_srq_uk;
@@ -198,7 +200,7 @@ enum i40iw_completion_notify {
struct i40iw_post_send {
i40iw_sgl sg_list;
- u8 num_sges;
+ u32 num_sges;
};
struct i40iw_post_inline_send {
@@ -220,7 +222,7 @@ struct i40iw_post_inline_send_w_inv {
struct i40iw_rdma_write {
i40iw_sgl lo_sg_list;
- u8 num_lo_sges;
+ u32 num_lo_sges;
struct i40iw_sge rem_addr;
};
@@ -325,7 +327,7 @@ struct i40iw_cq_ops {
void (*iw_cq_request_notification)(struct i40iw_cq_uk *,
enum i40iw_completion_notify);
enum i40iw_status_code (*iw_cq_poll_completion)(struct i40iw_cq_uk *,
- struct i40iw_cq_poll_info *, bool);
+ struct i40iw_cq_poll_info *);
enum i40iw_status_code (*iw_cq_post_entries)(struct i40iw_cq_uk *, u8 count);
void (*iw_cq_clean)(void *, struct i40iw_cq_uk *);
};
@@ -345,7 +347,9 @@ struct i40iw_dev_uk {
struct i40iw_sq_uk_wr_trk_info {
u64 wrid;
- u64 wr_len;
+ u32 wr_len;
+ u8 wqe_size;
+ u8 reserved[3];
};
struct i40iw_qp_quanta {
@@ -367,6 +371,8 @@ struct i40iw_qp_uk {
u32 qp_id;
u32 sq_size;
u32 rq_size;
+ u32 max_sq_frag_cnt;
+ u32 max_rq_frag_cnt;
struct i40iw_qp_uk_ops ops;
bool use_srq;
u8 swqe_polarity;
@@ -374,8 +380,6 @@ struct i40iw_qp_uk {
u8 rwqe_polarity;
u8 rq_wqe_size;
u8 rq_wqe_size_multiplier;
- u8 max_sq_frag_cnt;
- u8 max_rq_frag_cnt;
bool deferred_flag;
};
@@ -404,8 +408,9 @@ struct i40iw_qp_uk_init_info {
u32 qp_id;
u32 sq_size;
u32 rq_size;
- u8 max_sq_frag_cnt;
- u8 max_rq_frag_cnt;
+ u32 max_sq_frag_cnt;
+ u32 max_rq_frag_cnt;
+ u32 max_inline_data;
};
@@ -422,7 +427,10 @@ void i40iw_device_init_uk(struct i40iw_dev_uk *dev);
void i40iw_qp_post_wr(struct i40iw_qp_uk *qp);
u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx,
- u8 wqe_size);
+ u8 wqe_size,
+ u32 total_size,
+ u64 wr_id
+ );
u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx);
u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx);
@@ -434,9 +442,9 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp,
void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq);
enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 wr_id,
bool signaled, bool post_sq);
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size);
-enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size);
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size);
+enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size);
enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size,
u8 *wqe_size);
-enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift);
+enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift);
#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 1ceec81bd8eb..6fd043b1d714 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -59,7 +59,7 @@
* @action: modify, delete or add
*/
int i40iw_arp_table(struct i40iw_device *iwdev,
- __be32 *ip_addr,
+ u32 *ip_addr,
bool ipv4,
u8 *mac_addr,
u32 action)
@@ -152,7 +152,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
struct net_device *upper_dev;
struct i40iw_device *iwdev;
struct i40iw_handler *hdl;
- __be32 local_ipaddr;
+ u32 local_ipaddr;
hdl = i40iw_find_netdev(event_netdev);
if (!hdl)
@@ -167,11 +167,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
switch (event) {
case NETDEV_DOWN:
if (upper_dev)
- local_ipaddr =
- ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+ local_ipaddr = ntohl(
+ ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address);
else
- local_ipaddr = ifa->ifa_address;
- local_ipaddr = ntohl(local_ipaddr);
+ local_ipaddr = ntohl(ifa->ifa_address);
i40iw_manage_arp_cache(iwdev,
netdev->dev_addr,
&local_ipaddr,
@@ -180,11 +179,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
return NOTIFY_OK;
case NETDEV_UP:
if (upper_dev)
- local_ipaddr =
- ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+ local_ipaddr = ntohl(
+ ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address);
else
- local_ipaddr = ifa->ifa_address;
- local_ipaddr = ntohl(local_ipaddr);
+ local_ipaddr = ntohl(ifa->ifa_address);
i40iw_manage_arp_cache(iwdev,
netdev->dev_addr,
&local_ipaddr,
@@ -194,12 +192,11 @@ int i40iw_inetaddr_event(struct notifier_block *notifier,
case NETDEV_CHANGEADDR:
/* Add the address to the IP table */
if (upper_dev)
- local_ipaddr =
- ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+ local_ipaddr = ntohl(
+ ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address);
else
- local_ipaddr = ifa->ifa_address;
+ local_ipaddr = ntohl(ifa->ifa_address);
- local_ipaddr = ntohl(local_ipaddr);
i40iw_manage_arp_cache(iwdev,
netdev->dev_addr,
&local_ipaddr,
@@ -227,7 +224,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier,
struct net_device *netdev;
struct i40iw_device *iwdev;
struct i40iw_handler *hdl;
- __be32 local_ipaddr6[4];
+ u32 local_ipaddr6[4];
hdl = i40iw_find_netdev(event_netdev);
if (!hdl)
@@ -506,14 +503,19 @@ void i40iw_rem_ref(struct ib_qp *ibqp)
struct cqp_commands_info *cqp_info;
struct i40iw_device *iwdev;
u32 qp_num;
+ unsigned long flags;
iwqp = to_iwqp(ibqp);
- if (!atomic_dec_and_test(&iwqp->refcount))
+ iwdev = iwqp->iwdev;
+ spin_lock_irqsave(&iwdev->qptable_lock, flags);
+ if (!atomic_dec_and_test(&iwqp->refcount)) {
+ spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
return;
+ }
- iwdev = iwqp->iwdev;
qp_num = iwqp->ibqp.qp_num;
iwdev->qp_table[qp_num] = NULL;
+ spin_unlock_irqrestore(&iwdev->qptable_lock, flags);
cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false);
if (!cqp_request)
return;
@@ -671,8 +673,11 @@ enum i40iw_status_code i40iw_free_virt_mem(struct i40iw_hw *hw,
{
if (!mem)
return I40IW_ERR_PARAM;
+ /*
+ * mem->va points to the parent of mem, so both mem and mem->va
+ * can not be touched once mem->va is freed
+ */
kfree(mem->va);
- mem->va = NULL;
return 0;
}
@@ -985,21 +990,24 @@ enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev,
enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev)
{
struct i40iw_device *iwdev = dev->back_dev;
- enum i40iw_status_code err_code = 0;
int timeout_ret;
i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s[%u] dev %p, iwdev %p\n",
__func__, __LINE__, dev, iwdev);
- atomic_add(2, &iwdev->vchnl_msgs);
+
+ atomic_set(&iwdev->vchnl_msgs, 2);
timeout_ret = wait_event_timeout(iwdev->vchnl_waitq,
(atomic_read(&iwdev->vchnl_msgs) == 1),
I40IW_VCHNL_EVENT_TIMEOUT);
atomic_dec(&iwdev->vchnl_msgs);
if (!timeout_ret) {
i40iw_pr_err("virt channel completion timeout = 0x%x\n", timeout_ret);
- err_code = I40IW_ERR_TIMEOUT;
+ atomic_set(&iwdev->vchnl_msgs, 0);
+ dev->vchnl_up = false;
+ return I40IW_ERR_TIMEOUT;
}
- return err_code;
+ wake_up(&dev->vf_reqs);
+ return 0;
}
/**
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 1fe3b84a06e4..6329c971c22f 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -63,8 +63,8 @@ static int i40iw_query_device(struct ib_device *ibdev,
ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr);
props->fw_ver = I40IW_FW_VERSION;
props->device_cap_flags = iwdev->device_cap_flags;
- props->vendor_id = iwdev->vendor_id;
- props->vendor_part_id = iwdev->vendor_part_id;
+ props->vendor_id = iwdev->ldev->pcidev->vendor;
+ props->vendor_part_id = iwdev->ldev->pcidev->device;
props->hw_ver = (u32)iwdev->sc_dev.hw_rev;
props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
props->max_qp = iwdev->max_qp;
@@ -74,11 +74,12 @@ static int i40iw_query_device(struct ib_device *ibdev,
props->max_cqe = iwdev->max_cqe;
props->max_mr = iwdev->max_mr;
props->max_pd = iwdev->max_pd;
- props->max_sge_rd = 1;
+ props->max_sge_rd = I40IW_MAX_SGE_RD;
props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE;
props->max_qp_init_rd_atom = props->max_qp_rd_atom;
props->atomic_cap = IB_ATOMIC_NONE;
props->max_map_per_fmr = 1;
+ props->max_fast_reg_page_list_len = I40IW_MAX_PAGES_PER_FMR;
return 0;
}
@@ -120,7 +121,7 @@ static int i40iw_query_port(struct ib_device *ibdev,
props->pkey_tbl_len = 1;
props->active_width = IB_WIDTH_4X;
props->active_speed = 1;
- props->max_msg_sz = 0x80000000;
+ props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
return 0;
}
@@ -437,7 +438,6 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev,
kfree(iwqp->kqp.wrid_mem);
iwqp->kqp.wrid_mem = NULL;
kfree(iwqp->allocated_buffer);
- iwqp->allocated_buffer = NULL;
}
/**
@@ -521,17 +521,15 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev,
enum i40iw_status_code status;
struct i40iw_qp_uk_init_info *ukinfo = &info->qp_uk_init_info;
- ukinfo->max_sq_frag_cnt = I40IW_MAX_WQ_FRAGMENT_COUNT;
-
sq_size = i40iw_qp_roundup(ukinfo->sq_size + 1);
rq_size = i40iw_qp_roundup(ukinfo->rq_size + 1);
- status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, &sqshift);
+ status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, ukinfo->max_inline_data, &sqshift);
if (!status)
- status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, &rqshift);
+ status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift);
if (status)
- return -ENOSYS;
+ return -ENOMEM;
sqdepth = sq_size << sqshift;
rqdepth = rq_size << rqshift;
@@ -609,6 +607,9 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
if (init_attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE)
init_attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE;
+ if (init_attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT)
+ init_attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
+
memset(&init_info, 0, sizeof(init_info));
sq_size = init_attr->cap.max_send_wr;
@@ -618,6 +619,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
init_info.qp_uk_init_info.rq_size = rq_size;
init_info.qp_uk_init_info.max_sq_frag_cnt = init_attr->cap.max_send_sge;
init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge;
+ init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data;
mem = kzalloc(sizeof(*iwqp), GFP_KERNEL);
if (!mem)
@@ -669,7 +671,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp;
if (init_attr->qp_type != IB_QPT_RC) {
- err_code = -ENOSYS;
+ err_code = -EINVAL;
goto error;
}
if (iwdev->push_mode)
@@ -722,8 +724,10 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
iwarp_info = &iwqp->iwarp_info;
iwarp_info->rd_enable = true;
iwarp_info->wr_rdresp_en = true;
- if (!iwqp->user_mode)
+ if (!iwqp->user_mode) {
+ iwarp_info->fast_reg_en = true;
iwarp_info->priv_mode_en = true;
+ }
iwarp_info->ddp_ver = 1;
iwarp_info->rdmap_ver = 1;
@@ -784,11 +788,12 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
return ERR_PTR(err_code);
}
}
+ init_completion(&iwqp->sq_drained);
+ init_completion(&iwqp->rq_drained);
return &iwqp->ibqp;
error:
i40iw_free_qp_resources(iwdev, iwqp, qp_num);
- kfree(mem);
return ERR_PTR(err_code);
}
@@ -1444,6 +1449,167 @@ static int i40iw_handle_q_mem(struct i40iw_device *iwdev,
}
/**
+ * i40iw_hw_alloc_stag - cqp command to allocate stag
+ * @iwdev: iwarp device
+ * @iwmr: iwarp mr pointer
+ */
+static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr)
+{
+ struct i40iw_allocate_stag_info *info;
+ struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd);
+ enum i40iw_status_code status;
+ int err = 0;
+ struct i40iw_cqp_request *cqp_request;
+ struct cqp_commands_info *cqp_info;
+
+ cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true);
+ if (!cqp_request)
+ return -ENOMEM;
+
+ cqp_info = &cqp_request->info;
+ info = &cqp_info->in.u.alloc_stag.info;
+ memset(info, 0, sizeof(*info));
+ info->page_size = PAGE_SIZE;
+ info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT;
+ info->pd_id = iwpd->sc_pd.pd_id;
+ info->total_len = iwmr->length;
+ info->remote_access = true;
+ cqp_info->cqp_cmd = OP_ALLOC_STAG;
+ cqp_info->post_sq = 1;
+ cqp_info->in.u.alloc_stag.dev = &iwdev->sc_dev;
+ cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request;
+
+ status = i40iw_handle_cqp_op(iwdev, cqp_request);
+ if (status) {
+ err = -ENOMEM;
+ i40iw_pr_err("CQP-OP MR Reg fail");
+ }
+ return err;
+}
+
+/**
+ * i40iw_alloc_mr - register stag for fast memory registration
+ * @pd: ibpd pointer
+ * @mr_type: memory for stag registrion
+ * @max_num_sg: man number of pages
+ */
+static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct i40iw_pd *iwpd = to_iwpd(pd);
+ struct i40iw_device *iwdev = to_iwdev(pd->device);
+ struct i40iw_pble_alloc *palloc;
+ struct i40iw_pbl *iwpbl;
+ struct i40iw_mr *iwmr;
+ enum i40iw_status_code status;
+ u32 stag;
+ int err_code = -ENOMEM;
+
+ iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL);
+ if (!iwmr)
+ return ERR_PTR(-ENOMEM);
+
+ stag = i40iw_create_stag(iwdev);
+ if (!stag) {
+ err_code = -EOVERFLOW;
+ goto err;
+ }
+ iwmr->stag = stag;
+ iwmr->ibmr.rkey = stag;
+ iwmr->ibmr.lkey = stag;
+ iwmr->ibmr.pd = pd;
+ iwmr->ibmr.device = pd->device;
+ iwpbl = &iwmr->iwpbl;
+ iwpbl->iwmr = iwmr;
+ iwmr->type = IW_MEMREG_TYPE_MEM;
+ palloc = &iwpbl->pble_alloc;
+ iwmr->page_cnt = max_num_sg;
+ mutex_lock(&iwdev->pbl_mutex);
+ status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt);
+ mutex_unlock(&iwdev->pbl_mutex);
+ if (status)
+ goto err1;
+
+ if (palloc->level != I40IW_LEVEL_1)
+ goto err2;
+ err_code = i40iw_hw_alloc_stag(iwdev, iwmr);
+ if (err_code)
+ goto err2;
+ iwpbl->pbl_allocated = true;
+ i40iw_add_pdusecount(iwpd);
+ return &iwmr->ibmr;
+err2:
+ i40iw_free_pble(iwdev->pble_rsrc, palloc);
+err1:
+ i40iw_free_stag(iwdev, stag);
+err:
+ kfree(iwmr);
+ return ERR_PTR(err_code);
+}
+
+/**
+ * i40iw_set_page - populate pbl list for fmr
+ * @ibmr: ib mem to access iwarp mr pointer
+ * @addr: page dma address fro pbl list
+ */
+static int i40iw_set_page(struct ib_mr *ibmr, u64 addr)
+{
+ struct i40iw_mr *iwmr = to_iwmr(ibmr);
+ struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
+ struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
+ u64 *pbl;
+
+ if (unlikely(iwmr->npages == iwmr->page_cnt))
+ return -ENOMEM;
+
+ pbl = (u64 *)palloc->level1.addr;
+ pbl[iwmr->npages++] = cpu_to_le64(addr);
+ return 0;
+}
+
+/**
+ * i40iw_map_mr_sg - map of sg list for fmr
+ * @ibmr: ib mem to access iwarp mr pointer
+ * @sg: scatter gather list for fmr
+ * @sg_nents: number of sg pages
+ */
+static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset)
+{
+ struct i40iw_mr *iwmr = to_iwmr(ibmr);
+
+ iwmr->npages = 0;
+ return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page);
+}
+
+/**
+ * i40iw_drain_sq - drain the send queue
+ * @ibqp: ib qp pointer
+ */
+static void i40iw_drain_sq(struct ib_qp *ibqp)
+{
+ struct i40iw_qp *iwqp = to_iwqp(ibqp);
+ struct i40iw_sc_qp *qp = &iwqp->sc_qp;
+
+ if (I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring))
+ wait_for_completion(&iwqp->sq_drained);
+}
+
+/**
+ * i40iw_drain_rq - drain the receive queue
+ * @ibqp: ib qp pointer
+ */
+static void i40iw_drain_rq(struct ib_qp *ibqp)
+{
+ struct i40iw_qp *iwqp = to_iwqp(ibqp);
+ struct i40iw_sc_qp *qp = &iwqp->sc_qp;
+
+ if (I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring))
+ wait_for_completion(&iwqp->rq_drained);
+}
+
+/**
* i40iw_hwreg_mr - send cqp command for memory registration
* @iwdev: iwarp device
* @iwmr: iwarp mr pointer
@@ -1526,14 +1692,16 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
struct i40iw_mr *iwmr;
struct ib_umem *region;
struct i40iw_mem_reg_req req;
- u32 pbl_depth = 0;
+ u64 pbl_depth = 0;
u32 stag = 0;
u16 access;
- u32 region_length;
+ u64 region_length;
bool use_pbles = false;
unsigned long flags;
int err = -ENOSYS;
+ if (length > I40IW_MAX_MR_SIZE)
+ return ERR_PTR(-EINVAL);
region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
if (IS_ERR(region))
return (struct ib_mr *)region;
@@ -1564,7 +1732,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
palloc = &iwpbl->pble_alloc;
iwmr->type = req.reg_type;
- iwmr->page_cnt = pbl_depth;
+ iwmr->page_cnt = (u32)pbl_depth;
switch (req.reg_type) {
case IW_MEMREG_TYPE_QP:
@@ -1671,6 +1839,7 @@ struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *pd,
iwmr->ibmr.lkey = stag;
iwmr->page_cnt = 1;
iwmr->pgaddrmem[0] = addr;
+ iwmr->length = size;
status = i40iw_hwreg_mr(iwdev, iwmr, access);
if (status) {
i40iw_free_stag(iwdev, stag);
@@ -1694,7 +1863,7 @@ static struct ib_mr *i40iw_get_dma_mr(struct ib_pd *pd, int acc)
{
u64 kva = 0;
- return i40iw_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva);
+ return i40iw_reg_phys_mr(pd, 0, 0, acc, &kva);
}
/**
@@ -1756,8 +1925,7 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr)
}
if (iwpbl->pbl_allocated)
i40iw_free_pble(iwdev->pble_rsrc, palloc);
- kfree(iwpbl->iwmr);
- iwpbl->iwmr = NULL;
+ kfree(iwmr);
return 0;
}
@@ -1806,18 +1974,6 @@ static ssize_t i40iw_show_rev(struct device *dev,
}
/**
- * i40iw_show_fw_ver
- */
-static ssize_t i40iw_show_fw_ver(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- u32 firmware_version = I40IW_FW_VERSION;
-
- return sprintf(buf, "%u.%u\n", firmware_version,
- (firmware_version & 0x000000ff));
-}
-
-/**
* i40iw_show_hca
*/
static ssize_t i40iw_show_hca(struct device *dev,
@@ -1837,13 +1993,11 @@ static ssize_t i40iw_show_board(struct device *dev,
}
static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, i40iw_show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL);
static struct device_attribute *i40iw_dev_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id
};
@@ -1881,12 +2035,14 @@ static int i40iw_post_send(struct ib_qp *ibqp,
enum i40iw_status_code ret;
int err = 0;
unsigned long flags;
+ bool inv_stag;
iwqp = (struct i40iw_qp *)ibqp;
ukqp = &iwqp->sc_qp.qp_uk;
spin_lock_irqsave(&iwqp->lock, flags);
while (ib_wr) {
+ inv_stag = false;
memset(&info, 0, sizeof(info));
info.wr_id = (u64)(ib_wr->wr_id);
if ((ib_wr->send_flags & IB_SEND_SIGNALED) || iwqp->sig_all)
@@ -1896,23 +2052,36 @@ static int i40iw_post_send(struct ib_qp *ibqp,
switch (ib_wr->opcode) {
case IB_WR_SEND:
- if (ib_wr->send_flags & IB_SEND_SOLICITED)
- info.op_type = I40IW_OP_TYPE_SEND_SOL;
- else
- info.op_type = I40IW_OP_TYPE_SEND;
+ /* fall-through */
+ case IB_WR_SEND_WITH_INV:
+ if (ib_wr->opcode == IB_WR_SEND) {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ info.op_type = I40IW_OP_TYPE_SEND_SOL;
+ else
+ info.op_type = I40IW_OP_TYPE_SEND;
+ } else {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ info.op_type = I40IW_OP_TYPE_SEND_SOL_INV;
+ else
+ info.op_type = I40IW_OP_TYPE_SEND_INV;
+ }
if (ib_wr->send_flags & IB_SEND_INLINE) {
info.op.inline_send.data = (void *)(unsigned long)ib_wr->sg_list[0].addr;
info.op.inline_send.len = ib_wr->sg_list[0].length;
- ret = ukqp->ops.iw_inline_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false);
+ ret = ukqp->ops.iw_inline_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
} else {
info.op.send.num_sges = ib_wr->num_sge;
info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list;
- ret = ukqp->ops.iw_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false);
+ ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false);
}
- if (ret)
- err = -EIO;
+ if (ret) {
+ if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+ err = -ENOMEM;
+ else
+ err = -EINVAL;
+ }
break;
case IB_WR_RDMA_WRITE:
info.op_type = I40IW_OP_TYPE_RDMA_WRITE;
@@ -1933,10 +2102,21 @@ static int i40iw_post_send(struct ib_qp *ibqp,
ret = ukqp->ops.iw_rdma_write(ukqp, &info, false);
}
- if (ret)
- err = -EIO;
+ if (ret) {
+ if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+ err = -ENOMEM;
+ else
+ err = -EINVAL;
+ }
break;
+ case IB_WR_RDMA_READ_WITH_INV:
+ inv_stag = true;
+ /* fall-through*/
case IB_WR_RDMA_READ:
+ if (ib_wr->num_sge > I40IW_MAX_SGE_RD) {
+ err = -EINVAL;
+ break;
+ }
info.op_type = I40IW_OP_TYPE_RDMA_READ;
info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr;
info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey;
@@ -1944,10 +2124,56 @@ static int i40iw_post_send(struct ib_qp *ibqp,
info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr;
info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey;
info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length;
- ret = ukqp->ops.iw_rdma_read(ukqp, &info, false, false);
+ ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false);
+ if (ret) {
+ if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+ err = -ENOMEM;
+ else
+ err = -EINVAL;
+ }
+ break;
+ case IB_WR_LOCAL_INV:
+ info.op_type = I40IW_OP_TYPE_INV_STAG;
+ info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey;
+ ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true);
if (ret)
- err = -EIO;
+ err = -ENOMEM;
break;
+ case IB_WR_REG_MR:
+ {
+ struct i40iw_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr);
+ int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size);
+ int flags = reg_wr(ib_wr)->access;
+ struct i40iw_pble_alloc *palloc = &iwmr->iwpbl.pble_alloc;
+ struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev;
+ struct i40iw_fast_reg_stag_info info;
+
+ memset(&info, 0, sizeof(info));
+ info.access_rights = I40IW_ACCESS_FLAGS_LOCALREAD;
+ info.access_rights |= i40iw_get_user_access(flags);
+ info.stag_key = reg_wr(ib_wr)->key & 0xff;
+ info.stag_idx = reg_wr(ib_wr)->key >> 8;
+ info.wr_id = ib_wr->wr_id;
+
+ info.addr_type = I40IW_ADDR_TYPE_VA_BASED;
+ info.va = (void *)(uintptr_t)iwmr->ibmr.iova;
+ info.total_len = iwmr->ibmr.length;
+ info.reg_addr_pa = *(u64 *)palloc->level1.addr;
+ info.first_pm_pbl_index = palloc->level1.idx;
+ info.local_fence = ib_wr->send_flags & IB_SEND_FENCE;
+ info.signaled = ib_wr->send_flags & IB_SEND_SIGNALED;
+
+ if (iwmr->npages > I40IW_MIN_PAGES_PER_FMR)
+ info.chunk_size = 1;
+
+ if (page_shift == 21)
+ info.page_size = 1; /* 2M page */
+
+ ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true);
+ if (ret)
+ err = -ENOMEM;
+ break;
+ }
default:
err = -EINVAL;
i40iw_pr_err(" upost_send bad opcode = 0x%x\n",
@@ -1985,6 +2211,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
struct i40iw_sge sg_list[I40IW_MAX_WQ_FRAGMENT_COUNT];
enum i40iw_status_code ret = 0;
unsigned long flags;
+ int err = 0;
iwqp = (struct i40iw_qp *)ibqp;
ukqp = &iwqp->sc_qp.qp_uk;
@@ -1999,6 +2226,10 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
ret = ukqp->ops.iw_post_receive(ukqp, &post_recv);
if (ret) {
i40iw_pr_err(" post_recv err %d\n", ret);
+ if (ret == I40IW_ERR_QP_TOOMANY_WRS_POSTED)
+ err = -ENOMEM;
+ else
+ err = -EINVAL;
*bad_wr = ib_wr;
goto out;
}
@@ -2006,9 +2237,7 @@ static int i40iw_post_recv(struct ib_qp *ibqp,
}
out:
spin_unlock_irqrestore(&iwqp->lock, flags);
- if (ret)
- return -ENOSYS;
- return 0;
+ return err;
}
/**
@@ -2027,6 +2256,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
enum i40iw_status_code ret;
struct i40iw_cq_uk *ukcq;
struct i40iw_sc_qp *qp;
+ struct i40iw_qp *iwqp;
unsigned long flags;
iwcq = (struct i40iw_cq *)ibcq;
@@ -2034,9 +2264,11 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
spin_lock_irqsave(&iwcq->lock, flags);
while (cqe_count < num_entries) {
- ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true);
+ ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info);
if (ret == I40IW_ERR_QUEUE_EMPTY) {
break;
+ } else if (ret == I40IW_ERR_QUEUE_DESTROYED) {
+ continue;
} else if (ret) {
if (!cqe_count)
cqe_count = -1;
@@ -2044,10 +2276,12 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
}
entry->wc_flags = 0;
entry->wr_id = cq_poll_info.wr_id;
- if (!cq_poll_info.error)
- entry->status = IB_WC_SUCCESS;
- else
+ if (cq_poll_info.error) {
entry->status = IB_WC_WR_FLUSH_ERR;
+ entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err;
+ } else {
+ entry->status = IB_WC_SUCCESS;
+ }
switch (cq_poll_info.op_type) {
case I40IW_OP_TYPE_RDMA_WRITE:
@@ -2071,12 +2305,17 @@ static int i40iw_poll_cq(struct ib_cq *ibcq,
break;
}
- entry->vendor_err =
- cq_poll_info.major_err << 16 | cq_poll_info.minor_err;
entry->ex.imm_data = 0;
qp = (struct i40iw_sc_qp *)cq_poll_info.qp_handle;
entry->qp = (struct ib_qp *)qp->back_qp;
entry->src_qp = cq_poll_info.qp_id;
+ iwqp = (struct i40iw_qp *)qp->back_qp;
+ if (iwqp->iwarp_state > I40IW_QP_STATE_RTS) {
+ if (!I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring))
+ complete(&iwqp->sq_drained);
+ if (!I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring))
+ complete(&iwqp->rq_drained);
+ }
entry->byte_len = cq_poll_info.bytes_xfered;
entry++;
cqe_count++;
@@ -2095,13 +2334,16 @@ static int i40iw_req_notify_cq(struct ib_cq *ibcq,
{
struct i40iw_cq *iwcq;
struct i40iw_cq_uk *ukcq;
- enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_SOLICITED;
+ unsigned long flags;
+ enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_EVENT;
iwcq = (struct i40iw_cq *)ibcq;
ukcq = &iwcq->sc_cq.cq_uk;
- if (notify_flags == IB_CQ_NEXT_COMP)
- cq_notify = IW_CQ_COMPL_EVENT;
+ if (notify_flags == IB_CQ_SOLICITED)
+ cq_notify = IW_CQ_COMPL_SOLICITED;
+ spin_lock_irqsave(&iwcq->lock, flags);
ukcq->ops.iw_cq_request_notification(ukcq, cq_notify);
+ spin_unlock_irqrestore(&iwcq->lock, flags);
return 0;
}
@@ -2129,62 +2371,139 @@ static int i40iw_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static const char * const i40iw_hw_stat_names[] = {
+ // 32bit names
+ [I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards",
+ [I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts",
+ [I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes",
+ [I40IW_HW_STAT_INDEX_IP6RXDISCARD] = "ip6InDiscards",
+ [I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts",
+ [I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes",
+ [I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs",
+ [I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors",
+ [I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors",
+ // 64bit names
+ [I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4InOctets",
+ [I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4InPkts",
+ [I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4InReasmRqd",
+ [I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4InMcastPkts",
+ [I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4OutOctets",
+ [I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4OutPkts",
+ [I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4OutSegRqd",
+ [I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip4OutMcastPkts",
+ [I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6InOctets",
+ [I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6InPkts",
+ [I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6InReasmRqd",
+ [I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6InMcastPkts",
+ [I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6OutOctets",
+ [I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6OutPkts",
+ [I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6OutSegRqd",
+ [I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "ip6OutMcastPkts",
+ [I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "tcpInSegs",
+ [I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] =
+ "tcpOutSegs",
+ [I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwInRdmaReads",
+ [I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwInRdmaSends",
+ [I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwInRdmaWrites",
+ [I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwOutRdmaReads",
+ [I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwOutRdmaSends",
+ [I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwOutRdmaWrites",
+ [I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwRdmaBnd",
+ [I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] =
+ "iwRdmaInv"
+};
+
+static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str,
+ size_t str_len)
+{
+ u32 firmware_version = I40IW_FW_VERSION;
+
+ snprintf(str, str_len, "%u.%u", firmware_version,
+ (firmware_version & 0x000000ff));
+}
+
/**
- * i40iw_get_protocol_stats - Populates the rdma_stats structure
- * @ibdev: ib dev struct
- * @stats: iw protocol stats struct
+ * i40iw_alloc_hw_stats - Allocate a hw stats structure
+ * @ibdev: device pointer from stack
+ * @port_num: port number
*/
-static int i40iw_get_protocol_stats(struct ib_device *ibdev,
- union rdma_protocol_stats *stats)
+static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev,
+ u8 port_num)
+{
+ struct i40iw_device *iwdev = to_iwdev(ibdev);
+ struct i40iw_sc_dev *dev = &iwdev->sc_dev;
+ int num_counters = I40IW_HW_STAT_INDEX_MAX_32 +
+ I40IW_HW_STAT_INDEX_MAX_64;
+ unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN;
+
+ BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) !=
+ (I40IW_HW_STAT_INDEX_MAX_32 +
+ I40IW_HW_STAT_INDEX_MAX_64));
+
+ /*
+ * PFs get the default update lifespan, but VFs only update once
+ * per second
+ */
+ if (!dev->is_pf)
+ lifespan = 1000;
+ return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, num_counters,
+ lifespan);
+}
+
+/**
+ * i40iw_get_hw_stats - Populates the rdma_hw_stats structure
+ * @ibdev: device pointer from stack
+ * @stats: stats pointer from stack
+ * @port_num: port number
+ * @index: which hw counter the stack is requesting we update
+ */
+static int i40iw_get_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ u8 port_num, int index)
{
struct i40iw_device *iwdev = to_iwdev(ibdev);
struct i40iw_sc_dev *dev = &iwdev->sc_dev;
struct i40iw_dev_pestat *devstat = &dev->dev_pestat;
struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats;
- struct timespec curr_time;
- static struct timespec last_rd_time = {0, 0};
- enum i40iw_status_code status = 0;
unsigned long flags;
- curr_time = current_kernel_time();
- memset(stats, 0, sizeof(*stats));
-
if (dev->is_pf) {
spin_lock_irqsave(&devstat->stats_lock, flags);
devstat->ops.iw_hw_stat_read_all(devstat,
&devstat->hw_stats);
spin_unlock_irqrestore(&devstat->stats_lock, flags);
} else {
- if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1)
- status = i40iw_vchnl_vf_get_pe_stats(dev,
- &devstat->hw_stats);
-
- if (status)
+ if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats))
return -ENOSYS;
}
- stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] +
- hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXPKTS];
- stats->iw.ipInTruncatedPkts = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] +
- hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC];
- stats->iw.ipInDiscards = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] +
- hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD];
- stats->iw.ipOutNoRoutes = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] +
- hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE];
- stats->iw.ipReasmReqds = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] +
- hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS];
- stats->iw.ipFragCreates = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] +
- hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS];
- stats->iw.ipInMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] +
- hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS];
- stats->iw.ipOutMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] +
- hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXMCPKTS];
- stats->iw.tcpOutSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPTXSEG];
- stats->iw.tcpInSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPRXSEGS];
- stats->iw.tcpRetransSegs = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_TCPRTXSEG];
-
- last_rd_time = curr_time;
- return 0;
+ memcpy(&stats->value[0], &hw_stats, sizeof(*hw_stats));
+
+ return stats->num_counters;
}
/**
@@ -2218,7 +2537,7 @@ static int i40iw_modify_port(struct ib_device *ibdev,
int port_modify_mask,
struct ib_port_modify *props)
{
- return 0;
+ return -ENOSYS;
}
/**
@@ -2323,10 +2642,15 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr;
iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr;
iwibdev->ibdev.dereg_mr = i40iw_dereg_mr;
- iwibdev->ibdev.get_protocol_stats = i40iw_get_protocol_stats;
+ iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats;
+ iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats;
iwibdev->ibdev.query_device = i40iw_query_device;
iwibdev->ibdev.create_ah = i40iw_create_ah;
iwibdev->ibdev.destroy_ah = i40iw_destroy_ah;
+ iwibdev->ibdev.drain_sq = i40iw_drain_sq;
+ iwibdev->ibdev.drain_rq = i40iw_drain_rq;
+ iwibdev->ibdev.alloc_mr = i40iw_alloc_mr;
+ iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg;
iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL);
if (!iwibdev->ibdev.iwcm) {
ib_dealloc_device(&iwibdev->ibdev);
@@ -2345,6 +2669,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name,
sizeof(iwibdev->ibdev.iwcm->ifname));
iwibdev->ibdev.get_port_immutable = i40iw_port_immutable;
+ iwibdev->ibdev.get_dev_fw_str = i40iw_get_dev_fw_str;
iwibdev->ibdev.poll_cq = i40iw_poll_cq;
iwibdev->ibdev.req_notify_cq = i40iw_req_notify_cq;
iwibdev->ibdev.post_send = i40iw_post_send;
@@ -2408,7 +2733,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
iwdev->iwibdev = i40iw_init_rdma_device(iwdev);
if (!iwdev->iwibdev)
- return -ENOSYS;
+ return -ENOMEM;
iwibdev = iwdev->iwibdev;
ret = ib_register_device(&iwibdev->ibdev, NULL);
@@ -2433,5 +2758,5 @@ error:
kfree(iwdev->iwibdev->ibdev.iwcm);
iwdev->iwibdev->ibdev.iwcm = NULL;
ib_dealloc_device(&iwdev->iwibdev->ibdev);
- return -ENOSYS;
+ return ret;
}
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
index 1101f77080e6..0069be8a5a38 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
@@ -92,6 +92,7 @@ struct i40iw_mr {
struct ib_umem *region;
u16 type;
u32 page_cnt;
+ u32 npages;
u32 stag;
u64 length;
u64 pgaddrmem[MAX_SAVE_PAGE_ADDRS];
@@ -169,5 +170,7 @@ struct i40iw_qp {
struct i40iw_pbl *iwpbl;
struct i40iw_dma_mem q2_ctx_mem;
struct i40iw_dma_mem ietf_mem;
+ struct completion sq_drained;
+ struct completion rq_drained;
};
#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c
index cb0f18340e14..e33d4810965c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_vf.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c
@@ -80,6 +80,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
return 0;
}
-struct i40iw_vf_cqp_ops iw_vf_cqp_ops = {
+const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = {
i40iw_manage_vf_pble_bp
};
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h
index f649f3a62e13..4359559ece9c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_vf.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h
@@ -57,6 +57,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
u64 scratch,
bool post_sq);
-extern struct i40iw_vf_cqp_ops iw_vf_cqp_ops;
+extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops;
#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
index 6b68f7890b76..3041003c94d2 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
@@ -254,7 +254,7 @@ static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev,
static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev,
u32 vf_id,
struct i40iw_virtchnl_op_buf *vchnl_msg,
- struct i40iw_dev_hw_stats hw_stats)
+ struct i40iw_dev_hw_stats *hw_stats)
{
enum i40iw_status_code ret_code;
u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1];
@@ -264,7 +264,7 @@ static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev,
vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx;
vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer);
vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS;
- *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = hw_stats;
+ *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats;
ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer));
if (ret_code)
i40iw_debug(dev, I40IW_DEBUG_VIRT,
@@ -437,11 +437,9 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev,
vchnl_pf_send_get_ver_resp(dev, vf_id, vchnl_msg);
return I40IW_SUCCESS;
}
- for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT;
- iw_vf_idx++) {
+ for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) {
if (!dev->vf_dev[iw_vf_idx]) {
- if (first_avail_iw_vf ==
- I40IW_MAX_PE_ENABLED_VF_COUNT)
+ if (first_avail_iw_vf == I40IW_MAX_PE_ENABLED_VF_COUNT)
first_avail_iw_vf = iw_vf_idx;
continue;
}
@@ -541,7 +539,7 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev,
devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats);
spin_unlock_irqrestore(&dev->dev_pestat.stats_lock, flags);
vf_dev->msg_count--;
- vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, devstat->hw_stats);
+ vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &devstat->hw_stats);
break;
default:
i40iw_debug(dev, I40IW_DEBUG_VIRT,
@@ -596,23 +594,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev,
struct i40iw_virtchnl_req vchnl_req;
enum i40iw_status_code ret_code;
+ if (!i40iw_vf_clear_to_send(dev))
+ return I40IW_ERR_TIMEOUT;
memset(&vchnl_req, 0, sizeof(vchnl_req));
vchnl_req.dev = dev;
vchnl_req.parm = vchnl_ver;
vchnl_req.parm_len = sizeof(*vchnl_ver);
vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
ret_code = vchnl_vf_send_get_ver_req(dev, &vchnl_req);
- if (!ret_code) {
- ret_code = i40iw_vf_wait_vchnl_resp(dev);
- if (!ret_code)
- ret_code = vchnl_req.ret_code;
- else
- dev->vchnl_up = false;
- } else {
+ if (ret_code) {
i40iw_debug(dev, I40IW_DEBUG_VIRT,
"%s Send message failed 0x%0x\n", __func__, ret_code);
+ return ret_code;
}
- return ret_code;
+ ret_code = i40iw_vf_wait_vchnl_resp(dev);
+ if (ret_code)
+ return ret_code;
+ else
+ return vchnl_req.ret_code;
}
/**
@@ -626,23 +626,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev,
struct i40iw_virtchnl_req vchnl_req;
enum i40iw_status_code ret_code;
+ if (!i40iw_vf_clear_to_send(dev))
+ return I40IW_ERR_TIMEOUT;
memset(&vchnl_req, 0, sizeof(vchnl_req));
vchnl_req.dev = dev;
vchnl_req.parm = hmc_fcn;
vchnl_req.parm_len = sizeof(*hmc_fcn);
vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
ret_code = vchnl_vf_send_get_hmc_fcn_req(dev, &vchnl_req);
- if (!ret_code) {
- ret_code = i40iw_vf_wait_vchnl_resp(dev);
- if (!ret_code)
- ret_code = vchnl_req.ret_code;
- else
- dev->vchnl_up = false;
- } else {
+ if (ret_code) {
i40iw_debug(dev, I40IW_DEBUG_VIRT,
"%s Send message failed 0x%0x\n", __func__, ret_code);
+ return ret_code;
}
- return ret_code;
+ ret_code = i40iw_vf_wait_vchnl_resp(dev);
+ if (ret_code)
+ return ret_code;
+ else
+ return vchnl_req.ret_code;
}
/**
@@ -660,25 +662,27 @@ enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev,
struct i40iw_virtchnl_req vchnl_req;
enum i40iw_status_code ret_code;
+ if (!i40iw_vf_clear_to_send(dev))
+ return I40IW_ERR_TIMEOUT;
memset(&vchnl_req, 0, sizeof(vchnl_req));
vchnl_req.dev = dev;
vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
ret_code = vchnl_vf_send_add_hmc_objs_req(dev,
&vchnl_req,
rsrc_type,
start_index,
rsrc_count);
- if (!ret_code) {
- ret_code = i40iw_vf_wait_vchnl_resp(dev);
- if (!ret_code)
- ret_code = vchnl_req.ret_code;
- else
- dev->vchnl_up = false;
- } else {
+ if (ret_code) {
i40iw_debug(dev, I40IW_DEBUG_VIRT,
"%s Send message failed 0x%0x\n", __func__, ret_code);
+ return ret_code;
}
- return ret_code;
+ ret_code = i40iw_vf_wait_vchnl_resp(dev);
+ if (ret_code)
+ return ret_code;
+ else
+ return vchnl_req.ret_code;
}
/**
@@ -696,25 +700,27 @@ enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev,
struct i40iw_virtchnl_req vchnl_req;
enum i40iw_status_code ret_code;
+ if (!i40iw_vf_clear_to_send(dev))
+ return I40IW_ERR_TIMEOUT;
memset(&vchnl_req, 0, sizeof(vchnl_req));
vchnl_req.dev = dev;
vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
ret_code = vchnl_vf_send_del_hmc_objs_req(dev,
&vchnl_req,
rsrc_type,
start_index,
rsrc_count);
- if (!ret_code) {
- ret_code = i40iw_vf_wait_vchnl_resp(dev);
- if (!ret_code)
- ret_code = vchnl_req.ret_code;
- else
- dev->vchnl_up = false;
- } else {
+ if (ret_code) {
i40iw_debug(dev, I40IW_DEBUG_VIRT,
"%s Send message failed 0x%0x\n", __func__, ret_code);
+ return ret_code;
}
- return ret_code;
+ ret_code = i40iw_vf_wait_vchnl_resp(dev);
+ if (ret_code)
+ return ret_code;
+ else
+ return vchnl_req.ret_code;
}
/**
@@ -728,21 +734,23 @@ enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev,
struct i40iw_virtchnl_req vchnl_req;
enum i40iw_status_code ret_code;
+ if (!i40iw_vf_clear_to_send(dev))
+ return I40IW_ERR_TIMEOUT;
memset(&vchnl_req, 0, sizeof(vchnl_req));
vchnl_req.dev = dev;
vchnl_req.parm = hw_stats;
vchnl_req.parm_len = sizeof(*hw_stats);
vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg;
+
ret_code = vchnl_vf_send_get_pe_stats_req(dev, &vchnl_req);
- if (!ret_code) {
- ret_code = i40iw_vf_wait_vchnl_resp(dev);
- if (!ret_code)
- ret_code = vchnl_req.ret_code;
- else
- dev->vchnl_up = false;
- } else {
+ if (ret_code) {
i40iw_debug(dev, I40IW_DEBUG_VIRT,
"%s Send message failed 0x%0x\n", __func__, ret_code);
+ return ret_code;
}
- return ret_code;
+ ret_code = i40iw_vf_wait_vchnl_resp(dev);
+ if (ret_code)
+ return ret_code;
+ else
+ return vchnl_req.ret_code;
}
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index 105246fba2e7..5fc623362731 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -47,6 +47,7 @@ static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
ah->av.ib.g_slid = ah_attr->src_path_bits;
+ ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
if (ah_attr->ah_flags & IB_AH_GRH) {
ah->av.ib.g_slid |= 0x80;
ah->av.ib.gid_index = ah_attr->grh.sgid_index;
@@ -64,7 +65,6 @@ static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
!(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
--ah->av.ib.stat_rate;
}
- ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
return &ah->ibah;
}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 9f8b516eb2b0..5df63dacaaa3 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -288,7 +288,7 @@ static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
if (cq->resize_buf)
return -EBUSY;
- cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
if (!cq->resize_buf)
return -ENOMEM;
@@ -316,7 +316,7 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq
if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
return -EFAULT;
- cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_KERNEL);
if (!cq->resize_buf)
return -ENOMEM;
@@ -576,8 +576,8 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
checksum == cpu_to_be16(0xffff);
}
-static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
- unsigned tail, struct mlx4_cqe *cqe, int is_eth)
+static void use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
+ unsigned tail, struct mlx4_cqe *cqe, int is_eth)
{
struct mlx4_ib_proxy_sqp_hdr *hdr;
@@ -600,8 +600,6 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct
wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
}
-
- return 0;
}
static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries,
@@ -689,12 +687,6 @@ repoll:
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
MLX4_CQE_OPCODE_ERROR;
- if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
- is_send)) {
- pr_warn("Completion for NOP opcode detected!\n");
- return -EINVAL;
- }
-
/* Resize CQ in progress */
if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) {
if (cq->resize_buf) {
@@ -720,12 +712,6 @@ repoll:
*/
mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
be32_to_cpu(cqe->vlan_my_qpn));
- if (unlikely(!mqp)) {
- pr_warn("CQ %06x with entry for unknown QPN %06x\n",
- cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK);
- return -EINVAL;
- }
-
*cur_qp = to_mibqp(mqp);
}
@@ -738,11 +724,6 @@ repoll:
/* SRQ is also in the radix tree */
msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
srq_num);
- if (unlikely(!msrq)) {
- pr_warn("CQ %06x with entry for unknown SRQN %06x\n",
- cq->mcq.cqn, srq_num);
- return -EINVAL;
- }
}
if (is_send) {
@@ -852,9 +833,11 @@ repoll:
if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
if ((*cur_qp)->mlx4_ib_qp_type &
(MLX4_IB_QPT_PROXY_SMI_OWNER |
- MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
- return use_tunnel_data(*cur_qp, cq, wc, tail,
- cqe, is_eth);
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ use_tunnel_data(*cur_qp, cq, wc, tail, cqe,
+ is_eth);
+ return 0;
+ }
}
wc->slid = be16_to_cpu(cqe->rlid);
@@ -891,7 +874,6 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
struct mlx4_ib_qp *cur_qp = NULL;
unsigned long flags;
int npolled;
- int err = 0;
struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device);
spin_lock_irqsave(&cq->lock, flags);
@@ -901,8 +883,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
}
for (npolled = 0; npolled < num_entries; ++npolled) {
- err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
- if (err)
+ if (mlx4_ib_poll_one(cq, &cur_qp, wc + npolled))
break;
}
@@ -911,10 +892,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
out:
spin_unlock_irqrestore(&cq->lock, flags);
- if (err == 0 || err == -EAGAIN)
- return npolled;
- else
- return err;
+ return npolled;
}
int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index d68f506c1922..0f21c3a25552 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -527,7 +527,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
spin_unlock(&tun_qp->tx_lock);
if (ret)
- goto out;
+ goto end;
tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
if (tun_qp->tx_ring[tun_tx_ix].ah)
@@ -596,9 +596,15 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
wr.wr.send_flags = IB_SEND_SIGNALED;
ret = ib_post_send(src_qp, &wr.wr, &bad_wr);
-out:
- if (ret)
- ib_destroy_ah(ah);
+ if (!ret)
+ return 0;
+ out:
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+ tun_qp->tx_ring[tun_tx_ix].ah = NULL;
+end:
+ ib_destroy_ah(ah);
return ret;
}
@@ -1122,6 +1128,27 @@ void handle_port_mgmt_change_event(struct work_struct *work)
/* Generate GUID changed event */
if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
+ if (mlx4_is_master(dev->dev)) {
+ union ib_gid gid;
+ int err = 0;
+
+ if (!eqe->event.port_mgmt_change.params.port_info.gid_prefix)
+ err = __mlx4_ib_query_gid(&dev->ib_dev, port, 0, &gid, 1);
+ else
+ gid.global.subnet_prefix =
+ eqe->event.port_mgmt_change.params.port_info.gid_prefix;
+ if (err) {
+ pr_warn("Could not change QP1 subnet prefix for port %d: query_gid error (%d)\n",
+ port, err);
+ } else {
+ pr_debug("Changing QP1 subnet prefix for port %d. old=0x%llx. new=0x%llx\n",
+ port,
+ (u64)atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix),
+ be64_to_cpu(gid.global.subnet_prefix));
+ atomic64_set(&dev->sriov.demux[port - 1].subnet_prefix,
+ be64_to_cpu(gid.global.subnet_prefix));
+ }
+ }
mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
/*if master, notify all slaves*/
if (mlx4_is_master(dev->dev))
@@ -1326,9 +1353,15 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
ret = ib_post_send(send_qp, &wr.wr, &bad_wr);
+ if (!ret)
+ return 0;
+
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ sqp->tx_ring[wire_tx_ix].ah = NULL;
out:
- if (ret)
- ib_destroy_ah(ah);
+ ib_destroy_ah(ah);
return ret;
}
@@ -2190,6 +2223,8 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
if (err)
goto demux_err;
dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
+ atomic64_set(&dev->sriov.demux[i].subnet_prefix,
+ be64_to_cpu(gid.global.subnet_prefix));
err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
&dev->sriov.sqps[i]);
if (err)
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index f014eaf5969b..87ba9bca4181 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -505,9 +505,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
else
props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
- if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
- props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
}
+ if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
+ props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
@@ -1601,7 +1601,7 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att
else if (ret == -ENXIO)
pr_err("Device managed flow steering is disabled. Fail to register network rule.\n");
else if (ret)
- pr_err("Invalid argumant. Fail to register network rule.\n");
+ pr_err("Invalid argument. Fail to register network rule.\n");
mlx4_free_cmd_mailbox(mdev->dev, mailbox);
return ret;
@@ -1704,6 +1704,9 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
struct mlx4_dev *dev = (to_mdev(qp->device))->dev;
int is_bonded = mlx4_is_bonded(dev);
+ if (flow_attr->port < 1 || flow_attr->port > qp->device->phys_port_cnt)
+ return ERR_PTR(-EINVAL);
+
if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) &&
(flow_attr->type != IB_FLOW_ATTR_NORMAL))
return ERR_PTR(-EOPNOTSUPP);
@@ -2022,16 +2025,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
}
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
- char *buf)
-{
- struct mlx4_ib_dev *dev =
- container_of(device, struct mlx4_ib_dev, ib_dev.dev);
- return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
- (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
- (int) dev->dev->caps.fw_ver & 0xffff);
-}
-
static ssize_t show_rev(struct device *device, struct device_attribute *attr,
char *buf)
{
@@ -2050,17 +2043,207 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
}
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
static struct device_attribute *mlx4_class_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id
};
+struct diag_counter {
+ const char *name;
+ u32 offset;
+};
+
+#define DIAG_COUNTER(_name, _offset) \
+ { .name = #_name, .offset = _offset }
+
+static const struct diag_counter diag_basic[] = {
+ DIAG_COUNTER(rq_num_lle, 0x00),
+ DIAG_COUNTER(sq_num_lle, 0x04),
+ DIAG_COUNTER(rq_num_lqpoe, 0x08),
+ DIAG_COUNTER(sq_num_lqpoe, 0x0C),
+ DIAG_COUNTER(rq_num_lpe, 0x18),
+ DIAG_COUNTER(sq_num_lpe, 0x1C),
+ DIAG_COUNTER(rq_num_wrfe, 0x20),
+ DIAG_COUNTER(sq_num_wrfe, 0x24),
+ DIAG_COUNTER(sq_num_mwbe, 0x2C),
+ DIAG_COUNTER(sq_num_bre, 0x34),
+ DIAG_COUNTER(sq_num_rire, 0x44),
+ DIAG_COUNTER(rq_num_rire, 0x48),
+ DIAG_COUNTER(sq_num_rae, 0x4C),
+ DIAG_COUNTER(rq_num_rae, 0x50),
+ DIAG_COUNTER(sq_num_roe, 0x54),
+ DIAG_COUNTER(sq_num_tree, 0x5C),
+ DIAG_COUNTER(sq_num_rree, 0x64),
+ DIAG_COUNTER(rq_num_rnr, 0x68),
+ DIAG_COUNTER(sq_num_rnr, 0x6C),
+ DIAG_COUNTER(rq_num_oos, 0x100),
+ DIAG_COUNTER(sq_num_oos, 0x104),
+};
+
+static const struct diag_counter diag_ext[] = {
+ DIAG_COUNTER(rq_num_dup, 0x130),
+ DIAG_COUNTER(sq_num_to, 0x134),
+};
+
+static const struct diag_counter diag_device_only[] = {
+ DIAG_COUNTER(num_cqovf, 0x1A0),
+ DIAG_COUNTER(rq_num_udsdprd, 0x118),
+};
+
+static struct rdma_hw_stats *mlx4_ib_alloc_hw_stats(struct ib_device *ibdev,
+ u8 port_num)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+
+ if (!diag[!!port_num].name)
+ return NULL;
+
+ return rdma_alloc_hw_stats_struct(diag[!!port_num].name,
+ diag[!!port_num].num_counters,
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx4_ib_get_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ u8 port, int index)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_diag_counters *diag = dev->diag_counters;
+ u32 hw_value[ARRAY_SIZE(diag_device_only) +
+ ARRAY_SIZE(diag_ext) + ARRAY_SIZE(diag_basic)] = {};
+ int ret;
+ int i;
+
+ ret = mlx4_query_diag_counters(dev->dev,
+ MLX4_OP_MOD_QUERY_TRANSPORT_CI_ERRORS,
+ diag[!!port].offset, hw_value,
+ diag[!!port].num_counters, port);
+
+ if (ret)
+ return ret;
+
+ for (i = 0; i < diag[!!port].num_counters; i++)
+ stats->value[i] = hw_value[i];
+
+ return diag[!!port].num_counters;
+}
+
+static int __mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev,
+ const char ***name,
+ u32 **offset,
+ u32 *num,
+ bool port)
+{
+ u32 num_counters;
+
+ num_counters = ARRAY_SIZE(diag_basic);
+
+ if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT)
+ num_counters += ARRAY_SIZE(diag_ext);
+
+ if (!port)
+ num_counters += ARRAY_SIZE(diag_device_only);
+
+ *name = kcalloc(num_counters, sizeof(**name), GFP_KERNEL);
+ if (!*name)
+ return -ENOMEM;
+
+ *offset = kcalloc(num_counters, sizeof(**offset), GFP_KERNEL);
+ if (!*offset)
+ goto err_name;
+
+ *num = num_counters;
+
+ return 0;
+
+err_name:
+ kfree(*name);
+ return -ENOMEM;
+}
+
+static void mlx4_ib_fill_diag_counters(struct mlx4_ib_dev *ibdev,
+ const char **name,
+ u32 *offset,
+ bool port)
+{
+ int i;
+ int j;
+
+ for (i = 0, j = 0; i < ARRAY_SIZE(diag_basic); i++, j++) {
+ name[i] = diag_basic[i].name;
+ offset[i] = diag_basic[i].offset;
+ }
+
+ if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT) {
+ for (i = 0; i < ARRAY_SIZE(diag_ext); i++, j++) {
+ name[j] = diag_ext[i].name;
+ offset[j] = diag_ext[i].offset;
+ }
+ }
+
+ if (!port) {
+ for (i = 0; i < ARRAY_SIZE(diag_device_only); i++, j++) {
+ name[j] = diag_device_only[i].name;
+ offset[j] = diag_device_only[i].offset;
+ }
+ }
+}
+
+static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
+{
+ struct mlx4_ib_diag_counters *diag = ibdev->diag_counters;
+ int i;
+ int ret;
+ bool per_port = !!(ibdev->dev->caps.flags2 &
+ MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT);
+
+ if (mlx4_is_slave(ibdev->dev))
+ return 0;
+
+ for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+ /* i == 1 means we are building port counters */
+ if (i && !per_port)
+ continue;
+
+ ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].name,
+ &diag[i].offset,
+ &diag[i].num_counters, i);
+ if (ret)
+ goto err_alloc;
+
+ mlx4_ib_fill_diag_counters(ibdev, diag[i].name,
+ diag[i].offset, i);
+ }
+
+ ibdev->ib_dev.get_hw_stats = mlx4_ib_get_hw_stats;
+ ibdev->ib_dev.alloc_hw_stats = mlx4_ib_alloc_hw_stats;
+
+ return 0;
+
+err_alloc:
+ if (i) {
+ kfree(diag[i - 1].name);
+ kfree(diag[i - 1].offset);
+ }
+
+ return ret;
+}
+
+static void mlx4_ib_diag_cleanup(struct mlx4_ib_dev *ibdev)
+{
+ int i;
+
+ for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
+ kfree(ibdev->diag_counters[i].offset);
+ kfree(ibdev->diag_counters[i].name);
+ }
+}
+
#define MLX4_IB_INVALID_MAC ((u64)-1)
static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
struct net_device *dev,
@@ -2277,6 +2460,17 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_fw_ver_str(struct ib_device *device, char *str,
+ size_t str_len)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev);
+ snprintf(str, str_len, "%d.%d.%d",
+ (int) (dev->dev->caps.fw_ver >> 32),
+ (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+ (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
static void *mlx4_ib_add(struct mlx4_dev *dev)
{
struct mlx4_ib_dev *ibdev;
@@ -2410,6 +2604,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
+ ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str;
ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
if (!mlx4_is_slave(ibdev->dev)) {
@@ -2552,9 +2747,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
- if (ib_register_device(&ibdev->ib_dev, NULL))
+ if (mlx4_ib_alloc_diag_counters(ibdev))
goto err_steer_free_bitmap;
+ if (ib_register_device(&ibdev->ib_dev, NULL))
+ goto err_diag_counters;
+
if (mlx4_ib_mad_init(ibdev))
goto err_reg;
@@ -2620,6 +2818,9 @@ err_mad:
err_reg:
ib_unregister_device(&ibdev->ib_dev);
+err_diag_counters:
+ mlx4_ib_diag_cleanup(ibdev);
+
err_steer_free_bitmap:
kfree(ibdev->ib_uc_qpns_bitmap);
@@ -2723,6 +2924,7 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
mlx4_ib_close_sriov(ibdev);
mlx4_ib_mad_cleanup(ibdev);
ib_unregister_device(&ibdev->ib_dev);
+ mlx4_ib_diag_cleanup(ibdev);
if (ibdev->iboe.nb.notifier_call) {
if (unregister_netdevice_notifier(&ibdev->iboe.nb))
pr_warn("failure unregistering notifier\n");
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index 99451d887266..097bfcc4ee99 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -96,7 +96,7 @@ struct ib_sa_mcmember_data {
u8 scope_join_state;
u8 proxy_join;
u8 reserved[2];
-};
+} __packed __aligned(4);
struct mcast_group {
struct ib_sa_mcmember_data rec;
@@ -489,7 +489,7 @@ static u8 get_leave_state(struct mcast_group *group)
if (!group->members[i])
leave_state |= (1 << i);
- return leave_state & (group->rec.scope_join_state & 7);
+ return leave_state & (group->rec.scope_join_state & 0xf);
}
static int join_group(struct mcast_group *group, int slave, u8 join_mask)
@@ -564,8 +564,8 @@ static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
} else
mcg_warn_group(group, "DRIVER BUG\n");
} else if (group->state == MCAST_LEAVE_SENT) {
- if (group->rec.scope_join_state & 7)
- group->rec.scope_join_state &= 0xf8;
+ if (group->rec.scope_join_state & 0xf)
+ group->rec.scope_join_state &= 0xf0;
group->state = MCAST_IDLE;
mutex_unlock(&group->lock);
if (release_group(group, 1))
@@ -605,7 +605,7 @@ static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
static int handle_join_req(struct mcast_group *group, u8 join_mask,
struct mcast_req *req)
{
- u8 group_join_state = group->rec.scope_join_state & 7;
+ u8 group_join_state = group->rec.scope_join_state & 0xf;
int ref = 0;
u16 status;
struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
@@ -690,8 +690,8 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
u8 cur_join_state;
resp_join_state = ((struct ib_sa_mcmember_data *)
- group->response_sa_mad.data)->scope_join_state & 7;
- cur_join_state = group->rec.scope_join_state & 7;
+ group->response_sa_mad.data)->scope_join_state & 0xf;
+ cur_join_state = group->rec.scope_join_state & 0xf;
if (method == IB_MGMT_METHOD_GET_RESP) {
/* successfull join */
@@ -710,7 +710,7 @@ process_requests:
req = list_first_entry(&group->pending_list, struct mcast_req,
group_list);
sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
- req_join_state = sa_data->scope_join_state & 0x7;
+ req_join_state = sa_data->scope_join_state & 0xf;
/* For a leave request, we will immediately answer the VF, and
* update our internal counters. The actual leave will be sent
@@ -747,14 +747,11 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx
__be64 tid,
union ib_gid *new_mgid)
{
- struct mcast_group *group = NULL, *cur_group;
+ struct mcast_group *group = NULL, *cur_group, *n;
struct mcast_req *req;
- struct list_head *pos;
- struct list_head *n;
mutex_lock(&ctx->mcg_table_lock);
- list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
- group = list_entry(pos, struct mcast_group, mgid0_list);
+ list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) {
mutex_lock(&group->lock);
if (group->last_req_tid == tid) {
if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 1eca01cebe51..686ab48ff644 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -139,7 +139,7 @@ struct mlx4_ib_mr {
u32 max_pages;
struct mlx4_mr mmr;
struct ib_umem *umem;
- void *pages_alloc;
+ size_t page_map_size;
};
struct mlx4_ib_mw {
@@ -448,7 +448,7 @@ struct mlx4_ib_demux_ctx {
struct workqueue_struct *wq;
struct workqueue_struct *ud_wq;
spinlock_t ud_lock;
- __be64 subnet_prefix;
+ atomic64_t subnet_prefix;
__be64 guid_cache[128];
struct mlx4_ib_dev *dev;
/* the following lock protects both mcg_table and mcg_mgid0_list */
@@ -549,6 +549,14 @@ struct mlx4_ib_counters {
u32 default_counter;
};
+#define MLX4_DIAG_COUNTERS_TYPES 2
+
+struct mlx4_ib_diag_counters {
+ const char **name;
+ u32 *offset;
+ u32 num_counters;
+};
+
struct mlx4_ib_dev {
struct ib_device ib_dev;
struct mlx4_dev *dev;
@@ -585,6 +593,7 @@ struct mlx4_ib_dev {
/* protect resources needed as part of reset flow */
spinlock_t reset_flow_resource_lock;
struct list_head qp_list;
+ struct mlx4_ib_diag_counters diag_counters[MLX4_DIAG_COUNTERS_TYPES];
};
struct ib_event_work {
@@ -717,9 +726,8 @@ int mlx4_ib_dealloc_mw(struct ib_mw *mw);
struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type,
u32 max_num_sg);
-int mlx4_ib_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents);
+int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index ce0b5aa8eb9b..5d73989d9771 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -277,20 +277,23 @@ mlx4_alloc_priv_pages(struct ib_device *device,
struct mlx4_ib_mr *mr,
int max_pages)
{
- int size = max_pages * sizeof(u64);
- int add_size;
int ret;
- add_size = max_t(int, MLX4_MR_PAGES_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
+ /* Ensure that size is aligned to DMA cacheline
+ * requirements.
+ * max_pages is limited to MLX4_MAX_FAST_REG_PAGES
+ * so page_map_size will never cross PAGE_SIZE.
+ */
+ mr->page_map_size = roundup(max_pages * sizeof(u64),
+ MLX4_MR_PAGES_ALIGN);
- mr->pages_alloc = kzalloc(size + add_size, GFP_KERNEL);
- if (!mr->pages_alloc)
+ /* Prevent cross page boundary allocation. */
+ mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL);
+ if (!mr->pages)
return -ENOMEM;
- mr->pages = PTR_ALIGN(mr->pages_alloc, MLX4_MR_PAGES_ALIGN);
-
mr->page_map = dma_map_single(device->dma_device, mr->pages,
- size, DMA_TO_DEVICE);
+ mr->page_map_size, DMA_TO_DEVICE);
if (dma_mapping_error(device->dma_device, mr->page_map)) {
ret = -ENOMEM;
@@ -298,9 +301,9 @@ mlx4_alloc_priv_pages(struct ib_device *device,
}
return 0;
-err:
- kfree(mr->pages_alloc);
+err:
+ free_page((unsigned long)mr->pages);
return ret;
}
@@ -309,11 +312,10 @@ mlx4_free_priv_pages(struct mlx4_ib_mr *mr)
{
if (mr->pages) {
struct ib_device *device = mr->ibmr.device;
- int size = mr->max_pages * sizeof(u64);
dma_unmap_single(device->dma_device, mr->page_map,
- size, DMA_TO_DEVICE);
- kfree(mr->pages_alloc);
+ mr->page_map_size, DMA_TO_DEVICE);
+ free_page((unsigned long)mr->pages);
mr->pages = NULL;
}
}
@@ -528,9 +530,8 @@ static int mlx4_set_page(struct ib_mr *ibmr, u64 addr)
return 0;
}
-int mlx4_ib_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents)
+int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
{
struct mlx4_ib_mr *mr = to_mmr(ibmr);
int rc;
@@ -538,14 +539,12 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr,
mr->npages = 0;
ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map,
- sizeof(u64) * mr->max_pages,
- DMA_TO_DEVICE);
+ mr->page_map_size, DMA_TO_DEVICE);
- rc = ib_sg_to_pages(ibmr, sg, sg_nents, mlx4_set_page);
+ rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page);
ib_dma_sync_single_for_device(ibmr->device, mr->page_map,
- sizeof(u64) * mr->max_pages,
- DMA_TO_DEVICE);
+ mr->page_map_size, DMA_TO_DEVICE);
return rc;
}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index fd97534762b8..7fb9629bd12b 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -232,7 +232,7 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
}
} else {
ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
- s = (ctrl->fence_size & 0x3f) << 4;
+ s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
for (i = 64; i < s; i += 64) {
wqe = buf + i;
*wqe = cpu_to_be32(0xffffffff);
@@ -264,7 +264,7 @@ static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
}
ctrl->srcrb_flags = 0;
- ctrl->fence_size = size / 16;
+ ctrl->qpn_vlan.fence_size = size / 16;
/*
* Make sure descriptor is fully written before setting ownership bit
* (because HW can start executing as soon as we do).
@@ -362,7 +362,7 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
sizeof (struct mlx4_wqe_raddr_seg);
case MLX4_IB_QPT_RC:
return sizeof (struct mlx4_wqe_ctrl_seg) +
- sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_masked_atomic_seg) +
sizeof (struct mlx4_wqe_raddr_seg);
case MLX4_IB_QPT_SMI:
case MLX4_IB_QPT_GSI:
@@ -419,7 +419,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
}
static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
- enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+ enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp,
+ bool shrink_wqe)
{
int s;
@@ -477,7 +478,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
* We set WQE size to at least 64 bytes, this way stamping
* invalidates each WQE.
*/
- if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+ if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
qp->sq_signal_bits && BITS_PER_LONG == 64 &&
type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
!(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
@@ -642,6 +643,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
{
int qpn;
int err;
+ struct ib_qp_cap backup_cap;
struct mlx4_ib_sqp *sqp;
struct mlx4_ib_qp *qp;
enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
@@ -775,7 +777,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
goto err;
}
- err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+ memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap));
+ err = set_kernel_sq_size(dev, &init_attr->cap,
+ qp_type, qp, true);
if (err)
goto err;
@@ -787,9 +791,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
*qp->db.db = 0;
}
- if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
- err = -ENOMEM;
- goto err_db;
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size,
+ &qp->buf, gfp)) {
+ memcpy(&init_attr->cap, &backup_cap,
+ sizeof(backup_cap));
+ err = set_kernel_sq_size(dev, &init_attr->cap, qp_type,
+ qp, false);
+ if (err)
+ goto err_db;
+
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size,
+ PAGE_SIZE * 2, &qp->buf, gfp)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
}
err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
@@ -1176,8 +1191,10 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
{
err = create_qp_common(to_mdev(pd->device), pd, init_attr,
udata, 0, &qp, gfp);
- if (err)
+ if (err) {
+ kfree(qp);
return ERR_PTR(err);
+ }
qp->ibqp.qp_num = qp->mqp.qpn;
qp->xrcdn = xrcdn;
@@ -1975,7 +1992,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
ctrl = get_send_wqe(qp, i);
ctrl->owner_opcode = cpu_to_be32(1 << 31);
if (qp->sq_max_wqes_per_wr == 1)
- ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+ ctrl->qpn_vlan.fence_size =
+ 1 << (qp->sq.wqe_shift - 4);
stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
}
@@ -2475,24 +2493,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
sqp->ud_header.grh.flow_label =
ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
- if (is_eth)
+ if (is_eth) {
memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
- else {
- if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
- /* When multi-function is enabled, the ib_core gid
- * indexes don't necessarily match the hw ones, so
- * we must use our own cache */
- sqp->ud_header.grh.source_gid.global.subnet_prefix =
- to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
- subnet_prefix;
- sqp->ud_header.grh.source_gid.global.interface_id =
- to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
- guid_cache[ah->av.ib.gid_index];
- } else
- ib_get_cached_gid(ib_dev,
- be32_to_cpu(ah->av.ib.port_pd) >> 24,
- ah->av.ib.gid_index,
- &sqp->ud_header.grh.source_gid, NULL);
+ } else {
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache
+ */
+ sqp->ud_header.grh.source_gid.global.subnet_prefix =
+ cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov.
+ demux[sqp->qp.port - 1].
+ subnet_prefix)));
+ sqp->ud_header.grh.source_gid.global.interface_id =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ guid_cache[ah->av.ib.gid_index];
+ } else {
+ ib_get_cached_gid(ib_dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index,
+ &sqp->ud_header.grh.source_gid, NULL);
+ }
}
memcpy(sqp->ud_header.grh.destination_gid.raw,
ah->av.ib.dgid, 16);
@@ -3152,8 +3173,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
wmb();
*lso_wqe = lso_hdr_sz;
- ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
- MLX4_WQE_CTRL_FENCE : 0) | size;
+ ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
/*
* Make sure descriptor is fully written before
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index a00ba4418de9..e4fac9292e4a 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -424,6 +424,83 @@ static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
item->key = be32_to_cpu(cqe->mkey);
}
+static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
+ struct ib_wc *wc, int *npolled)
+{
+ struct mlx5_ib_wq *wq;
+ unsigned int cur;
+ unsigned int idx;
+ int np;
+ int i;
+
+ wq = &qp->sq;
+ cur = wq->head - wq->tail;
+ np = *npolled;
+
+ if (cur == 0)
+ return;
+
+ for (i = 0; i < cur && np < num_entries; i++) {
+ idx = wq->last_poll & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[idx];
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+ wq->tail++;
+ np++;
+ wc->qp = &qp->ibqp;
+ wc++;
+ wq->last_poll = wq->w_list[idx].next;
+ }
+ *npolled = np;
+}
+
+static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
+ struct ib_wc *wc, int *npolled)
+{
+ struct mlx5_ib_wq *wq;
+ unsigned int cur;
+ int np;
+ int i;
+
+ wq = &qp->rq;
+ cur = wq->head - wq->tail;
+ np = *npolled;
+
+ if (cur == 0)
+ return;
+
+ for (i = 0; i < cur && np < num_entries; i++) {
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+ wq->tail++;
+ np++;
+ wc->qp = &qp->ibqp;
+ wc++;
+ }
+ *npolled = np;
+}
+
+static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
+ struct ib_wc *wc, int *npolled)
+{
+ struct mlx5_ib_qp *qp;
+
+ *npolled = 0;
+ /* Find uncompleted WQEs belonging to that cq and retrun mmics ones */
+ list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
+ sw_send_comp(qp, num_entries, wc + *npolled, npolled);
+ if (*npolled >= num_entries)
+ return;
+ }
+
+ list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
+ sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
+ if (*npolled >= num_entries)
+ return;
+ }
+}
+
static int mlx5_poll_one(struct mlx5_ib_cq *cq,
struct mlx5_ib_qp **cur_qp,
struct ib_wc *wc)
@@ -476,12 +553,6 @@ repoll:
* from the table.
*/
mqp = __mlx5_qp_lookup(dev->mdev, qpn);
- if (unlikely(!mqp)) {
- mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n",
- cq->mcq.cqn, qpn);
- return -EINVAL;
- }
-
*cur_qp = to_mibqp(mqp);
}
@@ -542,13 +613,6 @@ repoll:
read_lock(&dev->mdev->priv.mkey_table.lock);
mmkey = __mlx5_mr_lookup(dev->mdev,
mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
- if (unlikely(!mmkey)) {
- read_unlock(&dev->mdev->priv.mkey_table.lock);
- mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n",
- cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey));
- return -EINVAL;
- }
-
mr = to_mibmr(mmkey);
get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
mr->sig->sig_err_exists = true;
@@ -594,31 +658,32 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
{
struct mlx5_ib_cq *cq = to_mcq(ibcq);
struct mlx5_ib_qp *cur_qp = NULL;
+ struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+ struct mlx5_core_dev *mdev = dev->mdev;
unsigned long flags;
int soft_polled = 0;
int npolled;
- int err = 0;
spin_lock_irqsave(&cq->lock, flags);
+ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+ mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+ goto out;
+ }
if (unlikely(!list_empty(&cq->wc_list)))
soft_polled = poll_soft_wc(cq, num_entries, wc);
for (npolled = 0; npolled < num_entries - soft_polled; npolled++) {
- err = mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled);
- if (err)
+ if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled))
break;
}
if (npolled)
mlx5_cq_set_ci(&cq->mcq);
-
+out:
spin_unlock_irqrestore(&cq->lock, flags);
- if (err == 0 || err == -EAGAIN)
- return soft_polled + npolled;
- else
- return err;
+ return soft_polled + npolled;
}
int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
@@ -822,7 +887,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
int eqn;
int err;
- if (entries < 0)
+ if (entries < 0 ||
+ (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))))
return ERR_PTR(-EINVAL);
if (check_cq_create_flags(attr->flags))
@@ -842,6 +908,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
cq->resize_buf = NULL;
cq->resize_umem = NULL;
cq->create_flags = attr->flags;
+ INIT_LIST_HEAD(&cq->list_send_qp);
+ INIT_LIST_HEAD(&cq->list_recv_qp);
if (context) {
err = create_cq_user(dev, udata, context, cq, entries,
@@ -879,7 +947,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
cq->mcq.irqn = irqn;
- cq->mcq.comp = mlx5_ib_cq_comp;
+ if (context)
+ cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
+ else
+ cq->mcq.comp = mlx5_ib_cq_comp;
cq->mcq.event = mlx5_ib_cq_event;
INIT_LIST_HEAD(&cq->wc_list);
@@ -1165,11 +1236,16 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
return -ENOSYS;
}
- if (entries < 1)
+ if (entries < 1 ||
+ entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) {
+ mlx5_ib_warn(dev, "wrong entries number %d, max %d\n",
+ entries,
+ 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz));
return -EINVAL;
+ }
entries = roundup_pow_of_two(entries + 1);
- if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1)
+ if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1)
return -EINVAL;
if (entries == ibcq->cqe + 1)
diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c
index 53e03c8ede79..79e6309460dc 100644
--- a/drivers/infiniband/hw/mlx5/gsi.c
+++ b/drivers/infiniband/hw/mlx5/gsi.c
@@ -69,15 +69,6 @@ static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev)
return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn);
}
-static u32 next_outstanding(struct mlx5_ib_gsi_qp *gsi, u32 index)
-{
- return ++index % gsi->cap.max_send_wr;
-}
-
-#define for_each_outstanding_wr(gsi, index) \
- for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; \
- index = next_outstanding(gsi, index))
-
/* Call with gsi->lock locked */
static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
{
@@ -85,8 +76,9 @@ static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
struct mlx5_ib_gsi_wr *wr;
u32 index;
- for_each_outstanding_wr(gsi, index) {
- wr = &gsi->outstanding_wrs[index];
+ for (index = gsi->outstanding_ci; index != gsi->outstanding_pi;
+ index++) {
+ wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr];
if (!wr->completed)
break;
@@ -430,8 +422,9 @@ static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi,
return -ENOMEM;
}
- gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi];
- gsi->outstanding_pi = next_outstanding(gsi, gsi->outstanding_pi);
+ gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi %
+ gsi->cap.max_send_wr];
+ gsi->outstanding_pi++;
if (!wc) {
memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc));
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 1534af113058..364aab9f3c9e 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -121,7 +121,7 @@ static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext,
pma_cnt_ext->port_xmit_data =
cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets,
transmitted_ib_multicast.octets) >> 2);
- pma_cnt_ext->port_xmit_data =
+ pma_cnt_ext->port_rcv_data =
cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets,
received_ib_multicast.octets) >> 2);
pma_cnt_ext->port_xmit_packets =
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 6ad0489cb3c5..e19537cf44ab 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -37,13 +37,17 @@
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
-#include <linux/io-mapping.h>
+#if defined(CONFIG_X86)
+#include <asm/pat.h>
+#endif
#include <linux/sched.h>
+#include <linux/delay.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <linux/mlx5/port.h>
#include <linux/mlx5/vport.h>
+#include <linux/list.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <linux/in.h>
@@ -284,7 +288,9 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
- return !MLX5_CAP_GEN(dev->mdev, ib_virt);
+ if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
+ return !MLX5_CAP_GEN(dev->mdev, ib_virt);
+ return 0;
}
enum {
@@ -454,8 +460,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
int max_rq_sg;
int max_sq_sg;
u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+ struct mlx5_ib_query_device_resp resp = {};
+ size_t resp_len;
+ u64 max_tso;
+
+ resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
+ if (uhw->outlen && uhw->outlen < resp_len)
+ return -EINVAL;
+ else
+ resp.response_length = resp_len;
- if (uhw->inlen || uhw->outlen)
+ if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
return -EINVAL;
memset(props, 0, sizeof(*props));
@@ -508,15 +523,33 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (MLX5_CAP_GEN(mdev, block_lb_mc))
props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
- if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
- (MLX5_CAP_ETH(dev->mdev, csum_cap)))
+ if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
+ if (MLX5_CAP_ETH(mdev, csum_cap))
props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
+ if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
+ max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
+ if (max_tso) {
+ resp.tso_caps.max_tso = 1 << max_tso;
+ resp.tso_caps.supported_qpts |=
+ 1 << IB_QPT_RAW_PACKET;
+ resp.response_length += sizeof(resp.tso_caps);
+ }
+ }
+ }
+
if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
props->device_cap_flags |= IB_DEVICE_UD_TSO;
}
+ if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
+ MLX5_CAP_ETH(dev->mdev, scatter_fcs))
+ props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
+
+ if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
+ props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
+
props->vendor_part_id = mdev->pdev->device;
props->hw_ver = mdev->pdev->revision;
@@ -566,6 +599,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (!mlx5_core_is_pf(mdev))
props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
+ if (uhw->outlen) {
+ err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -908,7 +948,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
- resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
+ if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
+ resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
resp.cache_line_size = L1_CACHE_BYTES;
resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
@@ -972,6 +1013,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
goto out_uars;
}
+ INIT_LIST_HEAD(&context->vma_private_list);
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
@@ -981,15 +1023,26 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
if (field_avail(typeof(resp), cqe_version, udata->outlen))
resp.response_length += sizeof(resp.cqe_version);
- if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
+ if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
+ resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE;
+ resp.response_length += sizeof(resp.cmds_supp_uhw);
+ }
+
+ /*
+ * We don't want to expose information from the PCI bar that is located
+ * after 4096 bytes, so if the arch only supports larger pages, let's
+ * pretend we don't support reading the HCA's core clock. This is also
+ * forced by mmap function.
+ */
+ if (PAGE_SIZE <= 4096 &&
+ field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
resp.comp_mask |=
MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
resp.hca_core_clock_offset =
offsetof(struct mlx5_init_seg, internal_timer_h) %
PAGE_SIZE;
resp.response_length += sizeof(resp.hca_core_clock_offset) +
- sizeof(resp.reserved2) +
- sizeof(resp.reserved3);
+ sizeof(resp.reserved2);
}
err = ib_copy_to_udata(udata, &resp, resp.response_length);
@@ -1068,38 +1121,209 @@ static int get_index(unsigned long offset)
return get_arg(offset);
}
+static void mlx5_ib_vma_open(struct vm_area_struct *area)
+{
+ /* vma_open is called when a new VMA is created on top of our VMA. This
+ * is done through either mremap flow or split_vma (usually due to
+ * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
+ * as this VMA is strongly hardware related. Therefore we set the
+ * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+ * calling us again and trying to do incorrect actions. We assume that
+ * the original VMA size is exactly a single page, and therefore all
+ * "splitting" operation will not happen to it.
+ */
+ area->vm_ops = NULL;
+}
+
+static void mlx5_ib_vma_close(struct vm_area_struct *area)
+{
+ struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
+
+ /* It's guaranteed that all VMAs opened on a FD are closed before the
+ * file itself is closed, therefore no sync is needed with the regular
+ * closing flow. (e.g. mlx5 ib_dealloc_ucontext)
+ * However need a sync with accessing the vma as part of
+ * mlx5_ib_disassociate_ucontext.
+ * The close operation is usually called under mm->mmap_sem except when
+ * process is exiting.
+ * The exiting case is handled explicitly as part of
+ * mlx5_ib_disassociate_ucontext.
+ */
+ mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
+
+ /* setting the vma context pointer to null in the mlx5_ib driver's
+ * private data, to protect a race condition in
+ * mlx5_ib_disassociate_ucontext().
+ */
+ mlx5_ib_vma_priv_data->vma = NULL;
+ list_del(&mlx5_ib_vma_priv_data->list);
+ kfree(mlx5_ib_vma_priv_data);
+}
+
+static const struct vm_operations_struct mlx5_ib_vm_ops = {
+ .open = mlx5_ib_vma_open,
+ .close = mlx5_ib_vma_close
+};
+
+static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
+ struct mlx5_ib_ucontext *ctx)
+{
+ struct mlx5_ib_vma_private_data *vma_prv;
+ struct list_head *vma_head = &ctx->vma_private_list;
+
+ vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
+ if (!vma_prv)
+ return -ENOMEM;
+
+ vma_prv->vma = vma;
+ vma->vm_private_data = vma_prv;
+ vma->vm_ops = &mlx5_ib_vm_ops;
+
+ list_add(&vma_prv->list, vma_head);
+
+ return 0;
+}
+
+static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+ int ret;
+ struct vm_area_struct *vma;
+ struct mlx5_ib_vma_private_data *vma_private, *n;
+ struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+ if (!owning_process)
+ return;
+
+ owning_mm = get_task_mm(owning_process);
+ if (!owning_mm) {
+ pr_info("no mm, disassociate ucontext is pending task termination\n");
+ while (1) {
+ put_task_struct(owning_process);
+ usleep_range(1000, 2000);
+ owning_process = get_pid_task(ibcontext->tgid,
+ PIDTYPE_PID);
+ if (!owning_process ||
+ owning_process->state == TASK_DEAD) {
+ pr_info("disassociate ucontext done, task was terminated\n");
+ /* in case task was dead need to release the
+ * task struct.
+ */
+ if (owning_process)
+ put_task_struct(owning_process);
+ return;
+ }
+ }
+ }
+
+ /* need to protect from a race on closing the vma as part of
+ * mlx5_ib_vma_close.
+ */
+ down_read(&owning_mm->mmap_sem);
+ list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
+ list) {
+ vma = vma_private->vma;
+ ret = zap_vma_ptes(vma, vma->vm_start,
+ PAGE_SIZE);
+ WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
+ /* context going to be destroyed, should
+ * not access ops any more.
+ */
+ vma->vm_ops = NULL;
+ list_del(&vma_private->list);
+ kfree(vma_private);
+ }
+ up_read(&owning_mm->mmap_sem);
+ mmput(owning_mm);
+ put_task_struct(owning_process);
+}
+
+static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
+{
+ switch (cmd) {
+ case MLX5_IB_MMAP_WC_PAGE:
+ return "WC";
+ case MLX5_IB_MMAP_REGULAR_PAGE:
+ return "best effort WC";
+ case MLX5_IB_MMAP_NC_PAGE:
+ return "NC";
+ default:
+ return NULL;
+ }
+}
+
+static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
+ struct vm_area_struct *vma,
+ struct mlx5_ib_ucontext *context)
+{
+ struct mlx5_uuar_info *uuari = &context->uuari;
+ int err;
+ unsigned long idx;
+ phys_addr_t pfn, pa;
+ pgprot_t prot;
+
+ switch (cmd) {
+ case MLX5_IB_MMAP_WC_PAGE:
+/* Some architectures don't support WC memory */
+#if defined(CONFIG_X86)
+ if (!pat_enabled())
+ return -EPERM;
+#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
+ return -EPERM;
+#endif
+ /* fall through */
+ case MLX5_IB_MMAP_REGULAR_PAGE:
+ /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
+ prot = pgprot_writecombine(vma->vm_page_prot);
+ break;
+ case MLX5_IB_MMAP_NC_PAGE:
+ prot = pgprot_noncached(vma->vm_page_prot);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ idx = get_index(vma->vm_pgoff);
+ if (idx >= uuari->num_uars)
+ return -EINVAL;
+
+ pfn = uar_index2pfn(dev, uuari->uars[idx].index);
+ mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
+
+ vma->vm_page_prot = prot;
+ err = io_remap_pfn_range(vma, vma->vm_start, pfn,
+ PAGE_SIZE, vma->vm_page_prot);
+ if (err) {
+ mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
+ err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
+ return -EAGAIN;
+ }
+
+ pa = pfn << PAGE_SHIFT;
+ mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
+ vma->vm_start, &pa);
+
+ return mlx5_ib_set_vma_data(vma, context);
+}
+
static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
{
struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
- struct mlx5_uuar_info *uuari = &context->uuari;
unsigned long command;
- unsigned long idx;
phys_addr_t pfn;
command = get_command(vma->vm_pgoff);
switch (command) {
+ case MLX5_IB_MMAP_WC_PAGE:
+ case MLX5_IB_MMAP_NC_PAGE:
case MLX5_IB_MMAP_REGULAR_PAGE:
- if (vma->vm_end - vma->vm_start != PAGE_SIZE)
- return -EINVAL;
-
- idx = get_index(vma->vm_pgoff);
- if (idx >= uuari->num_uars)
- return -EINVAL;
-
- pfn = uar_index2pfn(dev, uuari->uars[idx].index);
- mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
- (unsigned long long)pfn);
-
- vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
- if (io_remap_pfn_range(vma, vma->vm_start, pfn,
- PAGE_SIZE, vma->vm_page_prot))
- return -EAGAIN;
-
- mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
- vma->vm_start,
- (unsigned long long)pfn << PAGE_SHIFT);
- break;
+ return uar_mmap(dev, command, vma, context);
case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
return -ENOSYS;
@@ -1108,7 +1332,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
if (vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EINVAL;
- if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+ if (vma->vm_flags & VM_WRITE)
return -EPERM;
/* Don't expose to user-space information it shouldn't have */
@@ -1206,6 +1430,13 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
dmac_47_16),
ib_spec->eth.val.dst_mac);
+ ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+ smac_47_16),
+ ib_spec->eth.mask.src_mac);
+ ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+ smac_47_16),
+ ib_spec->eth.val.src_mac);
+
if (ib_spec->eth.mask.vlan_tag) {
MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
vlan_tag, 1);
@@ -1262,6 +1493,32 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
&ib_spec->ipv4.val.dst_ip,
sizeof(ib_spec->ipv4.val.dst_ip));
break;
+ case IB_FLOW_SPEC_IPV6:
+ if (ib_spec->size != sizeof(ib_spec->ipv6))
+ return -EINVAL;
+
+ MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
+ ethertype, 0xffff);
+ MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
+ ethertype, ETH_P_IPV6);
+
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+ src_ipv4_src_ipv6.ipv6_layout.ipv6),
+ &ib_spec->ipv6.mask.src_ip,
+ sizeof(ib_spec->ipv6.mask.src_ip));
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+ src_ipv4_src_ipv6.ipv6_layout.ipv6),
+ &ib_spec->ipv6.val.src_ip,
+ sizeof(ib_spec->ipv6.val.src_ip));
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+ dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+ &ib_spec->ipv6.mask.dst_ip,
+ sizeof(ib_spec->ipv6.mask.dst_ip));
+ memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
+ dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+ &ib_spec->ipv6.val.dst_ip,
+ sizeof(ib_spec->ipv6.val.dst_ip));
+ break;
case IB_FLOW_SPEC_TCP:
if (ib_spec->size != sizeof(ib_spec->tcp_udp))
return -EINVAL;
@@ -1438,7 +1695,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
if (!ft) {
ft = mlx5_create_auto_grouped_flow_table(ns, priority,
num_entries,
- num_groups);
+ num_groups,
+ 0);
if (!IS_ERR(ft)) {
prio->refcount = 0;
@@ -1458,21 +1716,18 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
{
struct mlx5_flow_table *ft = ft_prio->flow_table;
struct mlx5_ib_flow_handler *handler;
+ struct mlx5_flow_spec *spec;
void *ib_flow = flow_attr + 1;
- u8 match_criteria_enable = 0;
unsigned int spec_index;
- u32 *match_c;
- u32 *match_v;
u32 action;
int err = 0;
if (!is_valid_attr(flow_attr))
return ERR_PTR(-EINVAL);
- match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
- match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ spec = mlx5_vzalloc(sizeof(*spec));
handler = kzalloc(sizeof(*handler), GFP_KERNEL);
- if (!handler || !match_c || !match_v) {
+ if (!handler || !spec) {
err = -ENOMEM;
goto free;
}
@@ -1480,7 +1735,8 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
INIT_LIST_HEAD(&handler->list);
for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
- err = parse_flow_attr(match_c, match_v, ib_flow);
+ err = parse_flow_attr(spec->match_criteria,
+ spec->match_value, ib_flow);
if (err < 0)
goto free;
@@ -1488,11 +1744,11 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
}
/* Outer header support only */
- match_criteria_enable = (!outer_header_zero(match_c)) << 0;
+ spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria))
+ << 0;
action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
- handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable,
- match_c, match_v,
+ handler->rule = mlx5_add_flow_rule(ft, spec,
action,
MLX5_FS_DEFAULT_FLOW_TAG,
dst);
@@ -1508,8 +1764,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
free:
if (err)
kfree(handler);
- kfree(match_c);
- kfree(match_v);
+ kvfree(spec);
return err ? ERR_PTR(err) : handler;
}
@@ -1603,6 +1858,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
int domain)
{
struct mlx5_ib_dev *dev = to_mdev(qp->device);
+ struct mlx5_ib_qp *mqp = to_mqp(qp);
struct mlx5_ib_flow_handler *handler = NULL;
struct mlx5_flow_destination *dst = NULL;
struct mlx5_ib_flow_prio *ft_prio;
@@ -1629,7 +1885,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
}
dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
- dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn;
+ if (mqp->flags & MLX5_IB_QP_RSS)
+ dst->tir_num = mqp->rss_qp.tirn;
+ else
+ dst->tir_num = mqp->raw_packet_qp.rq.tirn;
if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) {
@@ -1734,15 +1993,6 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr,
return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
}
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
- char *buf)
-{
- struct mlx5_ib_dev *dev =
- container_of(device, struct mlx5_ib_dev, ib_dev.dev);
- return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev),
- fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
-}
-
static ssize_t show_rev(struct device *device, struct device_attribute *attr,
char *buf)
{
@@ -1761,7 +2011,6 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
}
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
@@ -1769,7 +2018,6 @@ static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
static struct device_attribute *mlx5_class_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id,
&dev_attr_fw_pages,
@@ -1787,6 +2035,65 @@ static void pkey_change_handler(struct work_struct *work)
mutex_unlock(&ports->devr->mutex);
}
+static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
+{
+ struct mlx5_ib_qp *mqp;
+ struct mlx5_ib_cq *send_mcq, *recv_mcq;
+ struct mlx5_core_cq *mcq;
+ struct list_head cq_armed_list;
+ unsigned long flags_qp;
+ unsigned long flags_cq;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&cq_armed_list);
+
+ /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/
+ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+ list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+ spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+ if (mqp->sq.tail != mqp->sq.head) {
+ send_mcq = to_mcq(mqp->ibqp.send_cq);
+ spin_lock_irqsave(&send_mcq->lock, flags_cq);
+ if (send_mcq->mcq.comp &&
+ mqp->ibqp.send_cq->comp_handler) {
+ if (!send_mcq->mcq.reset_notify_added) {
+ send_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&send_mcq->mcq.reset_notify,
+ &cq_armed_list);
+ }
+ }
+ spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+ }
+ spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+ spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+ /* no handling is needed for SRQ */
+ if (!mqp->ibqp.srq) {
+ if (mqp->rq.tail != mqp->rq.head) {
+ recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+ spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+ if (recv_mcq->mcq.comp &&
+ mqp->ibqp.recv_cq->comp_handler) {
+ if (!recv_mcq->mcq.reset_notify_added) {
+ recv_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&recv_mcq->mcq.reset_notify,
+ &cq_armed_list);
+ }
+ }
+ spin_unlock_irqrestore(&recv_mcq->lock,
+ flags_cq);
+ }
+ }
+ spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+ }
+ /*At that point all inflight post send were put to be executed as of we
+ * lock/unlock above locks Now need to arm all involved CQs.
+ */
+ list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
+ mcq->comp(mcq);
+ }
+ spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+}
+
static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
enum mlx5_dev_event event, unsigned long param)
{
@@ -1799,6 +2106,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
case MLX5_DEV_EVENT_SYS_ERROR:
ibdev->ib_active = false;
ibev.event = IB_EVENT_DEVICE_FATAL;
+ mlx5_ib_handle_internal_error(ibdev);
break;
case MLX5_DEV_EVENT_PORT_UP:
@@ -1807,14 +2115,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
break;
case MLX5_DEV_EVENT_PORT_DOWN:
+ case MLX5_DEV_EVENT_PORT_INITIALIZED:
ibev.event = IB_EVENT_PORT_ERR;
port = (u8)param;
break;
- case MLX5_DEV_EVENT_PORT_INITIALIZED:
- /* not used by ULPs */
- return;
-
case MLX5_DEV_EVENT_LID_CHANGE:
ibev.event = IB_EVENT_LID_CHANGE;
port = (u8)param;
@@ -2208,6 +2513,15 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_dev_fw_str(struct ib_device *ibdev, char *str,
+ size_t str_len)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(ibdev, struct mlx5_ib_dev, ib_dev);
+ snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
+ fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
+}
+
static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
{
int err;
@@ -2234,6 +2548,113 @@ static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
unregister_netdevice_notifier(&dev->roce.nb);
}
+static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_ports; i++)
+ mlx5_core_dealloc_q_counter(dev->mdev,
+ dev->port[i].q_cnt_id);
+}
+
+static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < dev->num_ports; i++) {
+ ret = mlx5_core_alloc_q_counter(dev->mdev,
+ &dev->port[i].q_cnt_id);
+ if (ret) {
+ mlx5_ib_warn(dev,
+ "couldn't allocate queue counter for port %d, err %d\n",
+ i + 1, ret);
+ goto dealloc_counters;
+ }
+ }
+
+ return 0;
+
+dealloc_counters:
+ while (--i >= 0)
+ mlx5_core_dealloc_q_counter(dev->mdev,
+ dev->port[i].q_cnt_id);
+
+ return ret;
+}
+
+static const char * const names[] = {
+ "rx_write_requests",
+ "rx_read_requests",
+ "rx_atomic_requests",
+ "out_of_buffer",
+ "out_of_sequence",
+ "duplicate_request",
+ "rnr_nak_retry_err",
+ "packet_seq_err",
+ "implied_nak_seq_err",
+ "local_ack_timeout_err",
+};
+
+static const size_t stats_offsets[] = {
+ MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
+ MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
+ MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
+ MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
+ MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
+ MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
+ MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
+ MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
+ MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
+ MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
+};
+
+static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
+ u8 port_num)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
+
+ /* We support only per port stats */
+ if (port_num == 0)
+ return NULL;
+
+ return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ u8 port, int index)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
+ void *out;
+ __be32 val;
+ int ret;
+ int i;
+
+ if (!port || !stats)
+ return -ENOSYS;
+
+ out = mlx5_vzalloc(outlen);
+ if (!out)
+ return -ENOMEM;
+
+ ret = mlx5_core_query_q_counter(dev->mdev,
+ dev->port[port - 1].q_cnt_id, 0,
+ out, outlen);
+ if (ret)
+ goto free;
+
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ val = *(__be32 *)(out + stats_offsets[i]);
+ stats->value[i] = (u64)be32_to_cpu(val);
+ }
+free:
+ kvfree(out);
+ return ARRAY_SIZE(names);
+}
+
static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
{
struct mlx5_ib_dev *dev;
@@ -2256,10 +2677,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->mdev = mdev;
+ dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
+ GFP_KERNEL);
+ if (!dev->port)
+ goto err_dealloc;
+
rwlock_init(&dev->roce.netdev_lock);
err = get_port_caps(dev);
if (err)
- goto err_dealloc;
+ goto err_free_port;
if (mlx5_use_mad_ifc(dev))
get_ext_port_caps(dev);
@@ -2354,6 +2780,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg;
dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
dev->ib_dev.get_port_immutable = mlx5_port_immutable;
+ dev->ib_dev.get_dev_fw_str = get_dev_fw_str;
if (mlx5_core_is_pf(mdev)) {
dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config;
dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state;
@@ -2361,6 +2788,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid;
}
+ dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
+
mlx5_ib_internal_fill_odp_caps(dev);
if (MLX5_CAP_GEN(mdev, imaicl)) {
@@ -2371,6 +2800,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
}
+ if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
+ MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+ dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats;
+ dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats;
+ }
+
if (MLX5_CAP_GEN(mdev, xrc)) {
dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -2383,9 +2818,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
IB_LINK_LAYER_ETHERNET) {
dev->ib_dev.create_flow = mlx5_ib_create_flow;
dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
+ dev->ib_dev.create_wq = mlx5_ib_create_wq;
+ dev->ib_dev.modify_wq = mlx5_ib_modify_wq;
+ dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq;
+ dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
+ dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
dev->ib_dev.uverbs_ex_cmd_mask |=
(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
- (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+ (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
+ (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
+ (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
+ (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
}
err = init_node_data(dev);
if (err)
@@ -2393,6 +2838,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
mutex_init(&dev->flow_db.lock);
mutex_init(&dev->cap_mask_mutex);
+ INIT_LIST_HEAD(&dev->qp_list);
+ spin_lock_init(&dev->reset_flow_resource_lock);
if (ll == IB_LINK_LAYER_ETHERNET) {
err = mlx5_enable_roce(dev);
@@ -2408,10 +2855,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
if (err)
goto err_rsrc;
- err = ib_register_device(&dev->ib_dev, NULL);
+ err = mlx5_ib_alloc_q_counters(dev);
if (err)
goto err_odp;
+ err = ib_register_device(&dev->ib_dev, NULL);
+ if (err)
+ goto err_q_cnt;
+
err = create_umr_res(dev);
if (err)
goto err_dev;
@@ -2433,6 +2884,9 @@ err_umrc:
err_dev:
ib_unregister_device(&dev->ib_dev);
+err_q_cnt:
+ mlx5_ib_dealloc_q_counters(dev);
+
err_odp:
mlx5_ib_odp_remove_one(dev);
@@ -2443,6 +2897,9 @@ err_disable_roce:
if (ll == IB_LINK_LAYER_ETHERNET)
mlx5_disable_roce(dev);
+err_free_port:
+ kfree(dev->port);
+
err_dealloc:
ib_dealloc_device((struct ib_device *)dev);
@@ -2455,11 +2912,13 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
ib_unregister_device(&dev->ib_dev);
+ mlx5_ib_dealloc_q_counters(dev);
destroy_umrc_res(dev);
mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
if (ll == IB_LINK_LAYER_ETHERNET)
mlx5_disable_roce(dev);
+ kfree(dev->port);
ib_dealloc_device(&dev->ib_dev);
}
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index 40df2cca0609..996b54e366b0 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -71,7 +71,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
addr = addr >> page_shift;
tmp = (unsigned long)addr;
- m = find_first_bit(&tmp, sizeof(tmp));
+ m = find_first_bit(&tmp, BITS_PER_LONG);
skip = 1 << m;
mask = skip - 1;
i = 0;
@@ -81,7 +81,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
for (k = 0; k < len; k++) {
if (!(i & mask)) {
tmp = (unsigned long)pfn;
- m = min_t(unsigned long, m, find_first_bit(&tmp, sizeof(tmp)));
+ m = min_t(unsigned long, m, find_first_bit(&tmp, BITS_PER_LONG));
skip = 1 << m;
mask = skip - 1;
base = pfn;
@@ -89,7 +89,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
} else {
if (base + p != pfn) {
tmp = (unsigned long)p;
- m = find_first_bit(&tmp, sizeof(tmp));
+ m = find_first_bit(&tmp, BITS_PER_LONG);
skip = 1 << m;
mask = skip - 1;
base = pfn;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index b46c25542a7c..95146f4aa3e3 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -70,6 +70,8 @@ enum {
enum mlx5_ib_mmap_cmd {
MLX5_IB_MMAP_REGULAR_PAGE = 0,
MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1,
+ MLX5_IB_MMAP_WC_PAGE = 2,
+ MLX5_IB_MMAP_NC_PAGE = 3,
/* 5 is chosen in order to be compatible with old versions of libmlx5 */
MLX5_IB_MMAP_CORE_CLOCK = 5,
};
@@ -103,6 +105,11 @@ enum {
MLX5_CQE_VERSION_V1,
};
+struct mlx5_ib_vma_private_data {
+ struct list_head list;
+ struct vm_area_struct *vma;
+};
+
struct mlx5_ib_ucontext {
struct ib_ucontext ibucontext;
struct list_head db_page_list;
@@ -114,6 +121,7 @@ struct mlx5_ib_ucontext {
u8 cqe_version;
/* Transport Domain number */
u32 tdn;
+ struct list_head vma_private_list;
};
static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -215,12 +223,41 @@ struct mlx5_ib_wq {
void *qend;
};
+struct mlx5_ib_rwq {
+ struct ib_wq ibwq;
+ u32 rqn;
+ u32 rq_num_pas;
+ u32 log_rq_stride;
+ u32 log_rq_size;
+ u32 rq_page_offset;
+ u32 log_page_size;
+ struct ib_umem *umem;
+ size_t buf_size;
+ unsigned int page_shift;
+ int create_type;
+ struct mlx5_db db;
+ u32 user_index;
+ u32 wqe_count;
+ u32 wqe_shift;
+ int wq_sig;
+};
+
enum {
MLX5_QP_USER,
MLX5_QP_KERNEL,
MLX5_QP_EMPTY
};
+enum {
+ MLX5_WQ_USER,
+ MLX5_WQ_KERNEL
+};
+
+struct mlx5_ib_rwq_ind_table {
+ struct ib_rwq_ind_table ib_rwq_ind_tbl;
+ u32 rqtn;
+};
+
/*
* Connect-IB can trigger up to four concurrent pagefaults
* per-QP.
@@ -264,6 +301,10 @@ struct mlx5_ib_qp_trans {
u8 resp_depth;
};
+struct mlx5_ib_rss_qp {
+ u32 tirn;
+};
+
struct mlx5_ib_rq {
struct mlx5_ib_qp_base base;
struct mlx5_ib_wq *rq;
@@ -292,6 +333,7 @@ struct mlx5_ib_qp {
union {
struct mlx5_ib_qp_trans trans_qp;
struct mlx5_ib_raw_packet_qp raw_packet_qp;
+ struct mlx5_ib_rss_qp rss_qp;
};
struct mlx5_buf buf;
@@ -338,6 +380,9 @@ struct mlx5_ib_qp {
spinlock_t disable_page_faults_lock;
struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
#endif
+ struct list_head qps_list;
+ struct list_head cq_recv_list;
+ struct list_head cq_send_list;
};
struct mlx5_ib_cq_buf {
@@ -356,6 +401,8 @@ enum mlx5_ib_qp_flags {
MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5,
/* QP uses 1 as its source QP number */
MLX5_IB_QP_SQPN_QP1 = 1 << 6,
+ MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7,
+ MLX5_IB_QP_RSS = 1 << 8,
};
struct mlx5_umr_wr {
@@ -398,6 +445,8 @@ struct mlx5_ib_cq {
struct mlx5_ib_cq_buf *resize_buf;
struct ib_umem *resize_umem;
int cqe_size;
+ struct list_head list_send_qp;
+ struct list_head list_recv_qp;
u32 create_flags;
struct list_head wc_list;
enum ib_cq_notify_flags notify_flags;
@@ -543,6 +592,10 @@ struct mlx5_ib_resources {
struct mutex mutex;
};
+struct mlx5_ib_port {
+ u16 q_cnt_id;
+};
+
struct mlx5_roce {
/* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
* netdev pointer
@@ -578,6 +631,11 @@ struct mlx5_ib_dev {
struct srcu_struct mr_srcu;
#endif
struct mlx5_ib_flow_db flow_db;
+ /* protect resources needed as part of reset flow */
+ spinlock_t reset_flow_resource_lock;
+ struct list_head qp_list;
+ /* Array with num_ports elements */
+ struct mlx5_ib_port *port;
};
static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -625,6 +683,16 @@ static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp)
return container_of(ibqp, struct mlx5_ib_qp, ibqp);
}
+static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq)
+{
+ return container_of(ibwq, struct mlx5_ib_rwq, ibwq);
+}
+
+static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+ return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl);
+}
+
static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
{
return container_of(msrq, struct mlx5_ib_srq, msrq);
@@ -712,9 +780,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type,
u32 max_num_sg);
-int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents);
+int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
const struct ib_wc *in_wc, const struct ib_grh *in_grh,
const struct ib_mad_hdr *in, size_t in_mad_size,
@@ -760,6 +827,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
+struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx5_ib_destroy_wq(struct ib_wq *wq);
+int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask, struct ib_udata *udata);
+struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
extern struct workqueue_struct *mlx5_ib_page_fault_wq;
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 4d5bff151cdf..4b021305c321 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1193,12 +1193,16 @@ error:
static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
+ struct mlx5_core_dev *mdev = dev->mdev;
struct umr_common *umrc = &dev->umrc;
struct mlx5_ib_umr_context umr_context;
struct mlx5_umr_wr umrwr = {};
struct ib_send_wr *bad;
int err;
+ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+ return 0;
+
mlx5_ib_init_umr_context(&umr_context);
umrwr.wr.wr_cqe = &umr_context.cqe;
@@ -1751,26 +1755,33 @@ done:
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
struct scatterlist *sgl,
- unsigned short sg_nents)
+ unsigned short sg_nents,
+ unsigned int *sg_offset_p)
{
struct scatterlist *sg = sgl;
struct mlx5_klm *klms = mr->descs;
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
u32 lkey = mr->ibmr.pd->local_dma_lkey;
int i;
- mr->ibmr.iova = sg_dma_address(sg);
+ mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
mr->ibmr.length = 0;
mr->ndescs = sg_nents;
for_each_sg(sgl, sg, sg_nents, i) {
if (unlikely(i > mr->max_descs))
break;
- klms[i].va = cpu_to_be64(sg_dma_address(sg));
- klms[i].bcount = cpu_to_be32(sg_dma_len(sg));
+ klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
+ klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
klms[i].key = cpu_to_be32(lkey);
mr->ibmr.length += sg_dma_len(sg);
+
+ sg_offset = 0;
}
+ if (sg_offset_p)
+ *sg_offset_p = sg_offset;
+
return i;
}
@@ -1788,9 +1799,8 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
return 0;
}
-int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents)
+int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
{
struct mlx5_ib_mr *mr = to_mmr(ibmr);
int n;
@@ -1802,9 +1812,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr,
DMA_TO_DEVICE);
if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
- n = mlx5_ib_sg_to_klms(mr, sg, sg_nents);
+ n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
else
- n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page);
+ n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
+ mlx5_set_page);
ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
mr->desc_size * mr->max_descs,
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 8dee8bc1e0fe..affc3f6598ca 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -77,6 +77,10 @@ struct mlx5_wqe_eth_pad {
u8 rsvd0[16];
};
+static void get_cqs(enum ib_qp_type qp_type,
+ struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
+ struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq);
+
static int is_qp0(enum ib_qp_type qp_type)
{
return qp_type == IB_QPT_SMI;
@@ -235,6 +239,8 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
qp->rq.max_gs = 0;
qp->rq.wqe_cnt = 0;
qp->rq.wqe_shift = 0;
+ cap->max_recv_wr = 0;
+ cap->max_recv_sge = 0;
} else {
if (ucmd) {
qp->rq.wqe_cnt = ucmd->rq_wqe_count;
@@ -607,6 +613,11 @@ static int to_mlx5_st(enum ib_qp_type type)
}
}
+static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
+ struct mlx5_ib_cq *recv_cq);
+static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
+ struct mlx5_ib_cq *recv_cq);
+
static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
{
return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
@@ -647,6 +658,71 @@ err_umem:
return err;
}
+static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq)
+{
+ struct mlx5_ib_ucontext *context;
+
+ context = to_mucontext(pd->uobject->context);
+ mlx5_ib_db_unmap_user(context, &rwq->db);
+ if (rwq->umem)
+ ib_umem_release(rwq->umem);
+}
+
+static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+ struct mlx5_ib_rwq *rwq,
+ struct mlx5_ib_create_wq *ucmd)
+{
+ struct mlx5_ib_ucontext *context;
+ int page_shift = 0;
+ int npages;
+ u32 offset = 0;
+ int ncont = 0;
+ int err;
+
+ if (!ucmd->buf_addr)
+ return -EINVAL;
+
+ context = to_mucontext(pd->uobject->context);
+ rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
+ rwq->buf_size, 0, 0);
+ if (IS_ERR(rwq->umem)) {
+ mlx5_ib_dbg(dev, "umem_get failed\n");
+ err = PTR_ERR(rwq->umem);
+ return err;
+ }
+
+ mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, &npages, &page_shift,
+ &ncont, NULL);
+ err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift,
+ &rwq->rq_page_offset);
+ if (err) {
+ mlx5_ib_warn(dev, "bad offset\n");
+ goto err_umem;
+ }
+
+ rwq->rq_num_pas = ncont;
+ rwq->page_shift = page_shift;
+ rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE);
+
+ mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %d\n",
+ (unsigned long long)ucmd->buf_addr, rwq->buf_size,
+ npages, page_shift, ncont, offset);
+
+ err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+ if (err) {
+ mlx5_ib_dbg(dev, "map failed\n");
+ goto err_umem;
+ }
+
+ rwq->create_type = MLX5_WQ_USER;
+ return 0;
+
+err_umem:
+ ib_umem_release(rwq->umem);
+ return err;
+}
+
static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
struct mlx5_ib_qp *qp, struct ib_udata *udata,
struct ib_qp_init_attr *attr,
@@ -1028,6 +1104,7 @@ static int get_rq_pas_size(void *qpc)
static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
struct mlx5_ib_rq *rq, void *qpin)
{
+ struct mlx5_ib_qp *mqp = rq->base.container_mibqp;
__be64 *pas;
__be64 *qp_pas;
void *in;
@@ -1051,6 +1128,9 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));
+ if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS)
+ MLX5_SET(rqc, rqc, scatter_fcs, 1);
+
wq = MLX5_ADDR_OF(rqc, rqc, wq);
MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
MLX5_SET(wq, wq, end_padding_mode,
@@ -1136,11 +1216,12 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
}
if (qp->rq.wqe_cnt) {
+ rq->base.container_mibqp = qp;
+
err = create_raw_packet_qp_rq(dev, rq, in);
if (err)
goto err_destroy_sq;
- rq->base.container_mibqp = qp;
err = create_raw_packet_qp_tir(dev, rq, tdn);
if (err)
@@ -1194,6 +1275,188 @@ static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
rq->doorbell = &qp->db;
}
+static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+ mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn);
+}
+
+static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+ struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct ib_uobject *uobj = pd->uobject;
+ struct ib_ucontext *ucontext = uobj->context;
+ struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+ struct mlx5_ib_create_qp_resp resp = {};
+ int inlen;
+ int err;
+ u32 *in;
+ void *tirc;
+ void *hfso;
+ u32 selected_fields = 0;
+ size_t min_resp_len;
+ u32 tdn = mucontext->tdn;
+ struct mlx5_ib_create_qp_rss ucmd = {};
+ size_t required_cmd_sz;
+
+ if (init_attr->qp_type != IB_QPT_RAW_PACKET)
+ return -EOPNOTSUPP;
+
+ if (init_attr->create_flags || init_attr->send_cq)
+ return -EINVAL;
+
+ min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index);
+ if (udata->outlen < min_resp_len)
+ return -EINVAL;
+
+ required_cmd_sz = offsetof(typeof(ucmd), reserved1) + sizeof(ucmd.reserved1);
+ if (udata->inlen < required_cmd_sz) {
+ mlx5_ib_dbg(dev, "invalid inlen\n");
+ return -EINVAL;
+ }
+
+ if (udata->inlen > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ udata->inlen - sizeof(ucmd))) {
+ mlx5_ib_dbg(dev, "inlen is not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+ mlx5_ib_dbg(dev, "copy failed\n");
+ return -EFAULT;
+ }
+
+ if (ucmd.comp_mask) {
+ mlx5_ib_dbg(dev, "invalid comp mask\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) {
+ mlx5_ib_dbg(dev, "invalid reserved\n");
+ return -EOPNOTSUPP;
+ }
+
+ err = ib_copy_to_udata(udata, &resp, min_resp_len);
+ if (err) {
+ mlx5_ib_dbg(dev, "copy failed\n");
+ return -EINVAL;
+ }
+
+ inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+ MLX5_SET(tirc, tirc, disp_type,
+ MLX5_TIRC_DISP_TYPE_INDIRECT);
+ MLX5_SET(tirc, tirc, indirect_table,
+ init_attr->rwq_ind_tbl->ind_tbl_num);
+ MLX5_SET(tirc, tirc, transport_domain, tdn);
+
+ hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+ switch (ucmd.rx_hash_function) {
+ case MLX5_RX_HASH_FUNC_TOEPLITZ:
+ {
+ void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
+ size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key);
+
+ if (len != ucmd.rx_key_len) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
+ MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+ memcpy(rss_key, ucmd.rx_hash_key, len);
+ break;
+ }
+ default:
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ if (!ucmd.rx_hash_fields_mask) {
+ /* special case when this TIR serves as steering entry without hashing */
+ if (!init_attr->rwq_ind_tbl->log_ind_tbl_size)
+ goto create_tir;
+ err = -EINVAL;
+ goto err;
+ }
+
+ if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) &&
+ ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */
+ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4))
+ MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+ MLX5_L3_PROT_TYPE_IPV4);
+ else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
+ MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+ MLX5_L3_PROT_TYPE_IPV6);
+
+ if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) &&
+ ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */
+ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP))
+ MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+ MLX5_L4_PROT_TYPE_TCP);
+ else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+ MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+ MLX5_L4_PROT_TYPE_UDP);
+
+ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6))
+ selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP;
+
+ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
+ selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP;
+
+ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP))
+ selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT;
+
+ if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) ||
+ (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+ selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT;
+
+ MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
+
+create_tir:
+ err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
+
+ if (err)
+ goto err;
+
+ kvfree(in);
+ /* qpn is reserved for that QP */
+ qp->trans_qp.base.mqp.qpn = 0;
+ qp->flags |= MLX5_IB_QP_RSS;
+ return 0;
+
+err:
+ kvfree(in);
+ return err;
+}
+
static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata, struct mlx5_ib_qp *qp)
@@ -1204,6 +1467,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
struct mlx5_ib_create_qp_resp resp;
struct mlx5_create_qp_mbox_in *in;
struct mlx5_ib_create_qp ucmd;
+ struct mlx5_ib_cq *send_cq;
+ struct mlx5_ib_cq *recv_cq;
+ unsigned long flags;
int inlen = sizeof(*in);
int err;
u32 uidx = MLX5_IB_DEFAULT_UIDX;
@@ -1220,6 +1486,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
spin_lock_init(&qp->sq.lock);
spin_lock_init(&qp->rq.lock);
+ if (init_attr->rwq_ind_tbl) {
+ if (!udata)
+ return -ENOSYS;
+
+ err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata);
+ return err;
+ }
+
if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
@@ -1252,6 +1526,19 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
return -EOPNOTSUPP;
}
+ if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
+ if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+ mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs");
+ return -EOPNOTSUPP;
+ }
+ if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) ||
+ !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
+ mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n");
+ return -EOPNOTSUPP;
+ }
+ qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS;
+ }
+
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
@@ -1440,6 +1727,23 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
base->container_mibqp = qp;
base->mqp.event = mlx5_ib_qp_event;
+ get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq,
+ &send_cq, &recv_cq);
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx5_ib_lock_cqs(send_cq, recv_cq);
+ /* Maintain device to QPs access, needed for further handling via reset
+ * flow
+ */
+ list_add_tail(&qp->qps_list, &dev->qp_list);
+ /* Maintain CQ to QPs access, needed for further handling via reset flow
+ */
+ if (send_cq)
+ list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp);
+ if (recv_cq)
+ list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp);
+ mlx5_ib_unlock_cqs(send_cq, recv_cq);
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
return 0;
err_create:
@@ -1458,23 +1762,23 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv
if (send_cq) {
if (recv_cq) {
if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
- spin_lock_irq(&send_cq->lock);
+ spin_lock(&send_cq->lock);
spin_lock_nested(&recv_cq->lock,
SINGLE_DEPTH_NESTING);
} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
- spin_lock_irq(&send_cq->lock);
+ spin_lock(&send_cq->lock);
__acquire(&recv_cq->lock);
} else {
- spin_lock_irq(&recv_cq->lock);
+ spin_lock(&recv_cq->lock);
spin_lock_nested(&send_cq->lock,
SINGLE_DEPTH_NESTING);
}
} else {
- spin_lock_irq(&send_cq->lock);
+ spin_lock(&send_cq->lock);
__acquire(&recv_cq->lock);
}
} else if (recv_cq) {
- spin_lock_irq(&recv_cq->lock);
+ spin_lock(&recv_cq->lock);
__acquire(&send_cq->lock);
} else {
__acquire(&send_cq->lock);
@@ -1489,21 +1793,21 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *re
if (recv_cq) {
if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
spin_unlock(&recv_cq->lock);
- spin_unlock_irq(&send_cq->lock);
+ spin_unlock(&send_cq->lock);
} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
__release(&recv_cq->lock);
- spin_unlock_irq(&send_cq->lock);
+ spin_unlock(&send_cq->lock);
} else {
spin_unlock(&send_cq->lock);
- spin_unlock_irq(&recv_cq->lock);
+ spin_unlock(&recv_cq->lock);
}
} else {
__release(&recv_cq->lock);
- spin_unlock_irq(&send_cq->lock);
+ spin_unlock(&send_cq->lock);
}
} else if (recv_cq) {
__release(&send_cq->lock);
- spin_unlock_irq(&recv_cq->lock);
+ spin_unlock(&recv_cq->lock);
} else {
__release(&recv_cq->lock);
__release(&send_cq->lock);
@@ -1515,17 +1819,18 @@ static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
return to_mpd(qp->ibqp.pd);
}
-static void get_cqs(struct mlx5_ib_qp *qp,
+static void get_cqs(enum ib_qp_type qp_type,
+ struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
{
- switch (qp->ibqp.qp_type) {
+ switch (qp_type) {
case IB_QPT_XRC_TGT:
*send_cq = NULL;
*recv_cq = NULL;
break;
case MLX5_IB_QPT_REG_UMR:
case IB_QPT_XRC_INI:
- *send_cq = to_mcq(qp->ibqp.send_cq);
+ *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
*recv_cq = NULL;
break;
@@ -1537,8 +1842,8 @@ static void get_cqs(struct mlx5_ib_qp *qp,
case IB_QPT_RAW_IPV6:
case IB_QPT_RAW_ETHERTYPE:
case IB_QPT_RAW_PACKET:
- *send_cq = to_mcq(qp->ibqp.send_cq);
- *recv_cq = to_mcq(qp->ibqp.recv_cq);
+ *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
+ *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
break;
case IB_QPT_MAX:
@@ -1557,8 +1862,14 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
struct mlx5_ib_cq *send_cq, *recv_cq;
struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct mlx5_modify_qp_mbox_in *in;
+ unsigned long flags;
int err;
+ if (qp->ibqp.rwq_ind_tbl) {
+ destroy_rss_raw_qp_tir(dev, qp);
+ return;
+ }
+
base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
&qp->raw_packet_qp.rq.base :
&qp->trans_qp.base;
@@ -1582,17 +1893,28 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
base->mqp.qpn);
}
- get_cqs(qp, &send_cq, &recv_cq);
+ get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
+ &send_cq, &recv_cq);
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx5_ib_lock_cqs(send_cq, recv_cq);
+ /* del from lists under both locks above to protect reset flow paths */
+ list_del(&qp->qps_list);
+ if (send_cq)
+ list_del(&qp->cq_send_list);
+
+ if (recv_cq)
+ list_del(&qp->cq_recv_list);
if (qp->create_type == MLX5_QP_KERNEL) {
- mlx5_ib_lock_cqs(send_cq, recv_cq);
__mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
if (send_cq != recv_cq)
__mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
NULL);
- mlx5_ib_unlock_cqs(send_cq, recv_cq);
}
+ mlx5_ib_unlock_cqs(send_cq, recv_cq);
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
destroy_raw_packet_qp(dev, qp);
@@ -1833,13 +2155,15 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
const struct ib_ah_attr *ah,
struct mlx5_qp_path *path, u8 port, int attr_mask,
- u32 path_flags, const struct ib_qp_attr *attr)
+ u32 path_flags, const struct ib_qp_attr *attr,
+ bool alt)
{
enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port);
int err;
if (attr_mask & IB_QP_PKEY_INDEX)
- path->pkey_index = attr->pkey_index;
+ path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index :
+ attr->pkey_index);
if (ah->ah_flags & IB_AH_GRH) {
if (ah->grh.sgid_index >=
@@ -1859,9 +2183,9 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
ah->grh.sgid_index);
path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;
} else {
- path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
- path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 :
- 0;
+ path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
+ path->fl_free_ar |=
+ (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0;
path->rlid = cpu_to_be16(ah->dlid);
path->grh_mlid = ah->src_path_bits & 0x7f;
if (ah->ah_flags & IB_AH_GRH)
@@ -1885,7 +2209,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
path->port = port;
if (attr_mask & IB_QP_TIMEOUT)
- path->ackto_lt = attr->timeout << 3;
+ path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3;
if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
return modify_raw_packet_eth_prio(dev->mdev,
@@ -2246,7 +2570,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);
if (attr_mask & IB_QP_PKEY_INDEX)
- context->pri_path.pkey_index = attr->pkey_index;
+ context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index);
/* todo implement counter_index functionality */
@@ -2259,7 +2583,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (attr_mask & IB_QP_AV) {
err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path,
attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
- attr_mask, 0, attr);
+ attr_mask, 0, attr, false);
if (err)
goto out;
}
@@ -2270,13 +2594,16 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (attr_mask & IB_QP_ALT_PATH) {
err = mlx5_set_path(dev, qp, &attr->alt_ah_attr,
&context->alt_path,
- attr->alt_port_num, attr_mask, 0, attr);
+ attr->alt_port_num,
+ attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT,
+ 0, attr, true);
if (err)
goto out;
}
pd = get_pd(qp);
- get_cqs(qp, &send_cq, &recv_cq);
+ get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
+ &send_cq, &recv_cq);
context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
@@ -2325,6 +2652,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
else
sqd_event = 0;
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
+ qp->port) - 1;
+ struct mlx5_ib_port *mibport = &dev->port[port_num];
+
+ context->qp_counter_set_usr_page |=
+ cpu_to_be32((u32)(mibport->q_cnt_id) << 24);
+ }
+
if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
context->sq_crq_size |= cpu_to_be16(1 << 4);
@@ -2415,6 +2751,9 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
int port;
enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
+ if (ibqp->rwq_ind_tbl)
+ return -ENOSYS;
+
if (unlikely(ibqp->qp_type == IB_QPT_GSI))
return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask);
@@ -3308,10 +3647,11 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr)
return MLX5_FENCE_MODE_SMALL_AND_FENCE;
else
return fence;
-
- } else {
- return 0;
+ } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) {
+ return MLX5_FENCE_MODE_FENCE;
}
+
+ return 0;
}
static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
@@ -3319,12 +3659,8 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
struct ib_send_wr *wr, unsigned *idx,
int *size, int nreq)
{
- int err = 0;
-
- if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) {
- err = -ENOMEM;
- return err;
- }
+ if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)))
+ return -ENOMEM;
*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
*seg = mlx5_get_send_wqe(qp, *idx);
@@ -3340,7 +3676,7 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
*seg += sizeof(**ctrl);
*size = sizeof(**ctrl) / 16;
- return err;
+ return 0;
}
static void finish_wqe(struct mlx5_ib_qp *qp,
@@ -3372,6 +3708,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
{
struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */
struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_core_dev *mdev = dev->mdev;
struct mlx5_ib_qp *qp;
struct mlx5_ib_mr *mr;
struct mlx5_wqe_data_seg *dpseg;
@@ -3399,6 +3736,13 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
spin_lock_irqsave(&qp->sq.lock, flags);
+ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
for (nreq = 0; wr; nreq++, wr = wr->next) {
if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
mlx5_ib_warn(dev, "\n");
@@ -3411,7 +3755,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
num_sge = wr->num_sge;
if (unlikely(num_sge > qp->sq.max_gs)) {
mlx5_ib_warn(dev, "\n");
- err = -ENOMEM;
+ err = -EINVAL;
*bad_wr = wr;
goto out;
}
@@ -3700,6 +4044,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct mlx5_ib_qp *qp = to_mqp(ibqp);
struct mlx5_wqe_data_seg *scat;
struct mlx5_rwqe_sig *sig;
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_core_dev *mdev = dev->mdev;
unsigned long flags;
int err = 0;
int nreq;
@@ -3711,6 +4057,13 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
spin_lock_irqsave(&qp->rq.lock, flags);
+ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
for (nreq = 0; wr; nreq++, wr = wr->next) {
@@ -3995,11 +4348,12 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
- qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f;
+ qp_attr->alt_pkey_index =
+ be16_to_cpu(context->alt_path.pkey_index);
qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
}
- qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f;
+ qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index);
qp_attr->port_num = context->pri_path.port;
/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
@@ -4029,6 +4383,9 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
int err = 0;
u8 raw_packet_qp_state;
+ if (ibqp->rwq_ind_tbl)
+ return -ENOSYS;
+
if (unlikely(ibqp->qp_type == IB_QPT_GSI))
return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
qp_init_attr);
@@ -4061,17 +4418,19 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
qp_attr->cap.max_recv_sge = qp->rq.max_gs;
if (!ibqp->uobject) {
- qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_wr = qp->sq.max_post;
qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ qp_init_attr->qp_context = ibqp->qp_context;
} else {
qp_attr->cap.max_send_wr = 0;
qp_attr->cap.max_send_sge = 0;
}
- /* We don't support inline sends for kernel QPs (yet), and we
- * don't know what userspace's value should be.
- */
- qp_attr->cap.max_inline_data = 0;
+ qp_init_attr->qp_type = ibqp->qp_type;
+ qp_init_attr->recv_cq = ibqp->recv_cq;
+ qp_init_attr->send_cq = ibqp->send_cq;
+ qp_init_attr->srq = ibqp->srq;
+ qp_attr->cap.max_inline_data = qp->max_inline_data;
qp_init_attr->cap = qp_attr->cap;
@@ -4136,3 +4495,322 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
return 0;
}
+
+static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr)
+{
+ struct mlx5_ib_dev *dev;
+ __be64 *rq_pas0;
+ void *in;
+ void *rqc;
+ void *wq;
+ int inlen;
+ int err;
+
+ dev = to_mdev(pd->device);
+
+ inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas;
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+ MLX5_SET(rqc, rqc, mem_rq_type,
+ MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE);
+ MLX5_SET(rqc, rqc, user_index, rwq->user_index);
+ MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn);
+ MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+ MLX5_SET(rqc, rqc, flush_in_error_en, 1);
+ wq = MLX5_ADDR_OF(rqc, rqc, wq);
+ MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+ MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
+ MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride);
+ MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size);
+ MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn);
+ MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset);
+ MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size);
+ MLX5_SET(wq, wq, wq_signature, rwq->wq_sig);
+ MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma);
+ rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
+ mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
+ err = mlx5_core_create_rq(dev->mdev, in, inlen, &rwq->rqn);
+ kvfree(in);
+ return err;
+}
+
+static int set_user_rq_size(struct mlx5_ib_dev *dev,
+ struct ib_wq_init_attr *wq_init_attr,
+ struct mlx5_ib_create_wq *ucmd,
+ struct mlx5_ib_rwq *rwq)
+{
+ /* Sanity check RQ size before proceeding */
+ if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz)))
+ return -EINVAL;
+
+ if (!ucmd->rq_wqe_count)
+ return -EINVAL;
+
+ rwq->wqe_count = ucmd->rq_wqe_count;
+ rwq->wqe_shift = ucmd->rq_wqe_shift;
+ rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift);
+ rwq->log_rq_stride = rwq->wqe_shift;
+ rwq->log_rq_size = ilog2(rwq->wqe_count);
+ return 0;
+}
+
+static int prepare_user_rq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata,
+ struct mlx5_ib_rwq *rwq)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_ib_create_wq ucmd = {};
+ int err;
+ size_t required_cmd_sz;
+
+ required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+ if (udata->inlen < required_cmd_sz) {
+ mlx5_ib_dbg(dev, "invalid inlen\n");
+ return -EINVAL;
+ }
+
+ if (udata->inlen > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ udata->inlen - sizeof(ucmd))) {
+ mlx5_ib_dbg(dev, "inlen is not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
+ mlx5_ib_dbg(dev, "copy failed\n");
+ return -EFAULT;
+ }
+
+ if (ucmd.comp_mask) {
+ mlx5_ib_dbg(dev, "invalid comp mask\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (ucmd.reserved) {
+ mlx5_ib_dbg(dev, "invalid reserved\n");
+ return -EOPNOTSUPP;
+ }
+
+ err = set_user_rq_size(dev, init_attr, &ucmd, rwq);
+ if (err) {
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ return err;
+ }
+
+ err = create_user_rq(dev, pd, rwq, &ucmd);
+ if (err) {
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ if (err)
+ return err;
+ }
+
+ rwq->user_index = ucmd.user_index;
+ return 0;
+}
+
+struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev;
+ struct mlx5_ib_rwq *rwq;
+ struct mlx5_ib_create_wq_resp resp = {};
+ size_t min_resp_len;
+ int err;
+
+ if (!udata)
+ return ERR_PTR(-ENOSYS);
+
+ min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+ if (udata->outlen && udata->outlen < min_resp_len)
+ return ERR_PTR(-EINVAL);
+
+ dev = to_mdev(pd->device);
+ switch (init_attr->wq_type) {
+ case IB_WQT_RQ:
+ rwq = kzalloc(sizeof(*rwq), GFP_KERNEL);
+ if (!rwq)
+ return ERR_PTR(-ENOMEM);
+ err = prepare_user_rq(pd, init_attr, udata, rwq);
+ if (err)
+ goto err;
+ err = create_rq(rwq, pd, init_attr);
+ if (err)
+ goto err_user_rq;
+ break;
+ default:
+ mlx5_ib_dbg(dev, "unsupported wq type %d\n",
+ init_attr->wq_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ rwq->ibwq.wq_num = rwq->rqn;
+ rwq->ibwq.state = IB_WQS_RESET;
+ if (udata->outlen) {
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+ err = ib_copy_to_udata(udata, &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+ }
+
+ return &rwq->ibwq;
+
+err_copy:
+ mlx5_core_destroy_rq(dev->mdev, rwq->rqn);
+err_user_rq:
+ destroy_user_rq(pd, rwq);
+err:
+ kfree(rwq);
+ return ERR_PTR(err);
+}
+
+int mlx5_ib_destroy_wq(struct ib_wq *wq)
+{
+ struct mlx5_ib_dev *dev = to_mdev(wq->device);
+ struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+
+ mlx5_core_destroy_rq(dev->mdev, rwq->rqn);
+ destroy_user_rq(wq->pd, rwq);
+ kfree(rwq);
+
+ return 0;
+}
+
+struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
+ struct ib_rwq_ind_table_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ struct mlx5_ib_rwq_ind_table *rwq_ind_tbl;
+ int sz = 1 << init_attr->log_ind_tbl_size;
+ struct mlx5_ib_create_rwq_ind_tbl_resp resp = {};
+ size_t min_resp_len;
+ int inlen;
+ int err;
+ int i;
+ u32 *in;
+ void *rqtc;
+
+ if (udata->inlen > 0 &&
+ !ib_is_udata_cleared(udata, 0,
+ udata->inlen))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+ if (udata->outlen && udata->outlen < min_resp_len)
+ return ERR_PTR(-EINVAL);
+
+ rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL);
+ if (!rwq_ind_tbl)
+ return ERR_PTR(-ENOMEM);
+
+ inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
+ in = mlx5_vzalloc(inlen);
+ if (!in) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+ MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+ MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
+
+ for (i = 0; i < sz; i++)
+ MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num);
+
+ err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn);
+ kvfree(in);
+
+ if (err)
+ goto err;
+
+ rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn;
+ if (udata->outlen) {
+ resp.response_length = offsetof(typeof(resp), response_length) +
+ sizeof(resp.response_length);
+ err = ib_copy_to_udata(udata, &resp, resp.response_length);
+ if (err)
+ goto err_copy;
+ }
+
+ return &rwq_ind_tbl->ib_rwq_ind_tbl;
+
+err_copy:
+ mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn);
+err:
+ kfree(rwq_ind_tbl);
+ return ERR_PTR(err);
+}
+
+int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
+{
+ struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl);
+ struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device);
+
+ mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn);
+
+ kfree(rwq_ind_tbl);
+ return 0;
+}
+
+int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
+ u32 wq_attr_mask, struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(wq->device);
+ struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+ struct mlx5_ib_modify_wq ucmd = {};
+ size_t required_cmd_sz;
+ int curr_wq_state;
+ int wq_state;
+ int inlen;
+ int err;
+ void *rqc;
+ void *in;
+
+ required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+ if (udata->inlen < required_cmd_sz)
+ return -EINVAL;
+
+ if (udata->inlen > sizeof(ucmd) &&
+ !ib_is_udata_cleared(udata, sizeof(ucmd),
+ udata->inlen - sizeof(ucmd)))
+ return -EOPNOTSUPP;
+
+ if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
+ return -EFAULT;
+
+ if (ucmd.comp_mask || ucmd.reserved)
+ return -EOPNOTSUPP;
+
+ inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
+ rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+ curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ?
+ wq_attr->curr_wq_state : wq->state;
+ wq_state = (wq_attr_mask & IB_WQ_STATE) ?
+ wq_attr->wq_state : curr_wq_state;
+ if (curr_wq_state == IB_WQS_ERR)
+ curr_wq_state = MLX5_RQC_STATE_ERR;
+ if (wq_state == IB_WQS_ERR)
+ wq_state = MLX5_RQC_STATE_ERR;
+ MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state);
+ MLX5_SET(rqc, rqc, state, wq_state);
+
+ err = mlx5_core_modify_rq(dev->mdev, rwq->rqn, in, inlen);
+ kvfree(in);
+ if (!err)
+ rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state;
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 3b2ddd64a371..ed6ac52355f1 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -74,14 +74,12 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
}
static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
- struct mlx5_create_srq_mbox_in **in,
- struct ib_udata *udata, int buf_size, int *inlen,
- int is_xrc)
+ struct mlx5_srq_attr *in,
+ struct ib_udata *udata, int buf_size)
{
struct mlx5_ib_dev *dev = to_mdev(pd->device);
struct mlx5_ib_create_srq ucmd = {};
size_t ucmdlen;
- void *xsrqc;
int err;
int npages;
int page_shift;
@@ -104,7 +102,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
udata->inlen - sizeof(ucmd)))
return -EINVAL;
- if (is_xrc) {
+ if (in->type == IB_SRQT_XRC) {
err = get_srq_user_index(to_mucontext(pd->uobject->context),
&ucmd, udata->inlen, &uidx);
if (err)
@@ -130,14 +128,13 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
goto err_umem;
}
- *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
- *in = mlx5_vzalloc(*inlen);
- if (!(*in)) {
+ in->pas = mlx5_vzalloc(sizeof(*in->pas) * ncont);
+ if (!in->pas) {
err = -ENOMEM;
goto err_umem;
}
- mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0);
+ mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0);
err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
ucmd.db_addr, &srq->db);
@@ -146,20 +143,16 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
goto err_in;
}
- (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
- (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
-
- if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
- is_xrc){
- xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
- xrc_srq_context_entry);
- MLX5_SET(xrc_srqc, xsrqc, user_index, uidx);
- }
+ in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ in->page_offset = offset;
+ if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
+ in->type == IB_SRQT_XRC)
+ in->user_index = uidx;
return 0;
err_in:
- kvfree(*in);
+ kvfree(in->pas);
err_umem:
ib_umem_release(srq->umem);
@@ -168,15 +161,13 @@ err_umem:
}
static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
- struct mlx5_create_srq_mbox_in **in, int buf_size,
- int *inlen, int is_xrc)
+ struct mlx5_srq_attr *in, int buf_size)
{
int err;
int i;
struct mlx5_wqe_srq_next_seg *next;
int page_shift;
int npages;
- void *xsrqc;
err = mlx5_db_alloc(dev->mdev, &srq->db);
if (err) {
@@ -204,13 +195,12 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT));
mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n",
buf_size, page_shift, srq->buf.npages, npages);
- *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages;
- *in = mlx5_vzalloc(*inlen);
- if (!*in) {
+ in->pas = mlx5_vzalloc(sizeof(*in->pas) * npages);
+ if (!in->pas) {
err = -ENOMEM;
goto err_buf;
}
- mlx5_fill_page_array(&srq->buf, (*in)->pas);
+ mlx5_fill_page_array(&srq->buf, in->pas);
srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
if (!srq->wrid) {
@@ -221,20 +211,15 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
}
srq->wq_sig = !!srq_signature;
- (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
-
- if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) &&
- is_xrc){
- xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in,
- xrc_srq_context_entry);
- /* 0xffffff means we ask to work with cqe version 0 */
- MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX);
- }
+ in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 &&
+ in->type == IB_SRQT_XRC)
+ in->user_index = MLX5_IB_DEFAULT_UIDX;
return 0;
err_in:
- kvfree(*in);
+ kvfree(in->pas);
err_buf:
mlx5_buf_free(dev->mdev, &srq->buf);
@@ -267,10 +252,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
int desc_size;
int buf_size;
int err;
- struct mlx5_create_srq_mbox_in *uninitialized_var(in);
- int uninitialized_var(inlen);
- int is_xrc;
- u32 flgs, xrcdn;
+ struct mlx5_srq_attr in = {0};
__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
/* Sanity check SRQ size before proceeding */
@@ -302,14 +284,10 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
srq->msrq.max_avail_gather);
- is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
-
if (pd->uobject)
- err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen,
- is_xrc);
+ err = create_srq_user(pd, srq, &in, udata, buf_size);
else
- err = create_srq_kernel(dev, srq, &in, buf_size, &inlen,
- is_xrc);
+ err = create_srq_kernel(dev, srq, &in, buf_size);
if (err) {
mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
@@ -317,23 +295,23 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
goto err_srq;
}
- in->ctx.state_log_sz = ilog2(srq->msrq.max);
- flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
- xrcdn = 0;
- if (is_xrc) {
- xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
- in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn);
+ in.type = init_attr->srq_type;
+ in.log_size = ilog2(srq->msrq.max);
+ in.wqe_shift = srq->msrq.wqe_shift - 4;
+ if (srq->wq_sig)
+ in.flags |= MLX5_SRQ_FLAG_WQ_SIG;
+ if (init_attr->srq_type == IB_SRQT_XRC) {
+ in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
+ in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn;
} else if (init_attr->srq_type == IB_SRQT_BASIC) {
- xrcdn = to_mxrcd(dev->devr.x0)->xrcdn;
- in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn);
+ in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn;
+ in.cqn = to_mcq(dev->devr.c0)->mcq.cqn;
}
- in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF));
-
- in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
- in->ctx.db_record = cpu_to_be64(srq->db.dma);
- err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc);
- kvfree(in);
+ in.pd = to_mpd(pd)->pdn;
+ in.db_record = srq->db.dma;
+ err = mlx5_core_create_srq(dev->mdev, &srq->msrq, &in);
+ kvfree(in.pas);
if (err) {
mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
goto err_usr_kern_srq;
@@ -401,7 +379,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
struct mlx5_ib_srq *srq = to_msrq(ibsrq);
int ret;
- struct mlx5_query_srq_mbox_out *out;
+ struct mlx5_srq_attr *out;
out = kzalloc(sizeof(*out), GFP_KERNEL);
if (!out)
@@ -411,7 +389,7 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
if (ret)
goto out_box;
- srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm);
+ srq_attr->srq_limit = out->lwm;
srq_attr->max_wr = srq->msrq.max - 1;
srq_attr->max_sge = srq->msrq.max_gs;
@@ -458,6 +436,8 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
struct mlx5_ib_srq *srq = to_msrq(ibsrq);
struct mlx5_wqe_srq_next_seg *next;
struct mlx5_wqe_data_seg *scat;
+ struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx5_core_dev *mdev = dev->mdev;
unsigned long flags;
int err = 0;
int nreq;
@@ -465,6 +445,12 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
spin_lock_irqsave(&srq->lock, flags);
+ if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ goto out;
+ }
+
for (nreq = 0; wr; nreq++, wr = wr->next) {
if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
err = -EINVAL;
@@ -507,7 +493,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
*srq->db.db = cpu_to_be32(srq->wqe_ctr);
}
-
+out:
spin_unlock_irqrestore(&srq->lock, flags);
return err;
diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h
index 61bc308bb802..188dac4301b5 100644
--- a/drivers/infiniband/hw/mlx5/user.h
+++ b/drivers/infiniband/hw/mlx5/user.h
@@ -46,6 +46,10 @@ enum {
MLX5_SRQ_FLAG_SIGNATURE = 1 << 0,
};
+enum {
+ MLX5_WQ_FLAG_SIGNATURE = 1 << 0,
+};
+
/* Increment this value if any changes that break userspace ABI
* compatibility are made.
@@ -79,6 +83,10 @@ enum mlx5_ib_alloc_ucontext_resp_mask {
MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0,
};
+enum mlx5_user_cmds_supp_uhw {
+ MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE = 1 << 0,
+};
+
struct mlx5_ib_alloc_ucontext_resp {
__u32 qp_tab_size;
__u32 bf_reg_size;
@@ -94,8 +102,8 @@ struct mlx5_ib_alloc_ucontext_resp {
__u32 comp_mask;
__u32 response_length;
__u8 cqe_version;
- __u8 reserved2;
- __u16 reserved3;
+ __u8 cmds_supp_uhw;
+ __u16 reserved2;
__u64 hca_core_clock_offset;
};
@@ -103,6 +111,22 @@ struct mlx5_ib_alloc_pd_resp {
__u32 pdn;
};
+struct mlx5_ib_tso_caps {
+ __u32 max_tso; /* Maximum tso payload size in bytes */
+
+ /* Corresponding bit will be set if qp type from
+ * 'enum ib_qp_type' is supported, e.g.
+ * supported_qpts |= 1 << IB_QPT_UD
+ */
+ __u32 supported_qpts;
+};
+
+struct mlx5_ib_query_device_resp {
+ __u32 comp_mask;
+ __u32 response_length;
+ struct mlx5_ib_tso_caps tso_caps;
+};
+
struct mlx5_ib_create_cq {
__u64 buf_addr;
__u64 db_addr;
@@ -148,6 +172,40 @@ struct mlx5_ib_create_qp {
__u64 sq_buf_addr;
};
+/* RX Hash function flags */
+enum mlx5_rx_hash_function_flags {
+ MLX5_RX_HASH_FUNC_TOEPLITZ = 1 << 0,
+};
+
+/*
+ * RX Hash flags, these flags allows to set which incoming packet's field should
+ * participates in RX Hash. Each flag represent certain packet's field,
+ * when the flag is set the field that is represented by the flag will
+ * participate in RX Hash calculation.
+ * Note: *IPV4 and *IPV6 flags can't be enabled together on the same QP
+ * and *TCP and *UDP flags can't be enabled together on the same QP.
+*/
+enum mlx5_rx_hash_fields {
+ MLX5_RX_HASH_SRC_IPV4 = 1 << 0,
+ MLX5_RX_HASH_DST_IPV4 = 1 << 1,
+ MLX5_RX_HASH_SRC_IPV6 = 1 << 2,
+ MLX5_RX_HASH_DST_IPV6 = 1 << 3,
+ MLX5_RX_HASH_SRC_PORT_TCP = 1 << 4,
+ MLX5_RX_HASH_DST_PORT_TCP = 1 << 5,
+ MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6,
+ MLX5_RX_HASH_DST_PORT_UDP = 1 << 7
+};
+
+struct mlx5_ib_create_qp_rss {
+ __u64 rx_hash_fields_mask; /* enum mlx5_rx_hash_fields */
+ __u8 rx_hash_function; /* enum mlx5_rx_hash_function_flags */
+ __u8 rx_key_len; /* valid only for Toeplitz */
+ __u8 reserved[6];
+ __u8 rx_hash_key[128]; /* valid only for Toeplitz */
+ __u32 comp_mask;
+ __u32 reserved1;
+};
+
struct mlx5_ib_create_qp_resp {
__u32 uuar_index;
};
@@ -159,6 +217,32 @@ struct mlx5_ib_alloc_mw {
__u16 reserved2;
};
+struct mlx5_ib_create_wq {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 rq_wqe_count;
+ __u32 rq_wqe_shift;
+ __u32 user_index;
+ __u32 flags;
+ __u32 comp_mask;
+ __u32 reserved;
+};
+
+struct mlx5_ib_create_wq_resp {
+ __u32 response_length;
+ __u32 reserved;
+};
+
+struct mlx5_ib_create_rwq_ind_tbl_resp {
+ __u32 response_length;
+ __u32 reserved;
+};
+
+struct mlx5_ib_modify_wq {
+ __u32 comp_mask;
+ __u32 reserved;
+};
+
static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext,
struct mlx5_ib_create_qp *ucmd,
int inlen,
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 9866c35cc977..da2335f7f7c3 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -1081,16 +1081,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
return sprintf(buf, "%x\n", dev->rev_id);
}
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
- char *buf)
-{
- struct mthca_dev *dev =
- container_of(device, struct mthca_dev, ib_dev.dev);
- return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32),
- (int) (dev->fw_ver >> 16) & 0xffff,
- (int) dev->fw_ver & 0xffff);
-}
-
static ssize_t show_hca(struct device *device, struct device_attribute *attr,
char *buf)
{
@@ -1120,13 +1110,11 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr,
}
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
static struct device_attribute *mthca_dev_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id
};
@@ -1187,6 +1175,17 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_dev_fw_str(struct ib_device *device, char *str,
+ size_t str_len)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev);
+ snprintf(str, str_len, "%d.%d.%d",
+ (int) (dev->fw_ver >> 32),
+ (int) (dev->fw_ver >> 16) & 0xffff,
+ (int) dev->fw_ver & 0xffff);
+}
+
int mthca_register_device(struct mthca_dev *dev)
{
int ret;
@@ -1266,6 +1265,7 @@ int mthca_register_device(struct mthca_dev *dev)
dev->ib_dev.reg_user_mr = mthca_reg_user_mr;
dev->ib_dev.dereg_mr = mthca_dereg_mr;
dev->ib_dev.get_port_immutable = mthca_port_immutable;
+ dev->ib_dev.get_dev_fw_str = get_dev_fw_str;
if (dev->mthca_flags & MTHCA_FLAG_FMR) {
dev->ib_dev.alloc_fmr = mthca_alloc_fmr;
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
index 74c6a9426047..6727af27c017 100644
--- a/drivers/infiniband/hw/mthca/mthca_reset.c
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -98,7 +98,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENOMEM;
mthca_err(mdev, "Couldn't allocate memory to save HCA "
"PCI header, aborting.\n");
- goto out;
+ goto put_dev;
}
for (i = 0; i < 64; ++i) {
@@ -108,7 +108,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENODEV;
mthca_err(mdev, "Couldn't save HCA "
"PCI header, aborting.\n");
- goto out;
+ goto free_hca;
}
}
@@ -121,7 +121,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENOMEM;
mthca_err(mdev, "Couldn't allocate memory to save HCA "
"bridge PCI header, aborting.\n");
- goto out;
+ goto free_hca;
}
for (i = 0; i < 64; ++i) {
@@ -131,7 +131,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENODEV;
mthca_err(mdev, "Couldn't save HCA bridge "
"PCI header, aborting.\n");
- goto out;
+ goto free_bh;
}
}
bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
@@ -139,7 +139,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENODEV;
mthca_err(mdev, "Couldn't locate HCA bridge "
"PCI-X capability, aborting.\n");
- goto out;
+ goto free_bh;
}
}
@@ -152,7 +152,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENOMEM;
mthca_err(mdev, "Couldn't map HCA reset register, "
"aborting.\n");
- goto out;
+ goto free_bh;
}
writel(MTHCA_RESET_VALUE, reset);
@@ -172,7 +172,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENODEV;
mthca_err(mdev, "Couldn't access HCA after reset, "
"aborting.\n");
- goto out;
+ goto free_bh;
}
if (v != 0xffffffff)
@@ -184,7 +184,7 @@ int mthca_reset(struct mthca_dev *mdev)
err = -ENODEV;
mthca_err(mdev, "PCI device did not come back after reset, "
"aborting.\n");
- goto out;
+ goto free_bh;
}
good:
@@ -195,14 +195,14 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
"split transaction control, aborting.\n");
- goto out;
+ goto free_bh;
}
if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
"split transaction control, aborting.\n");
- goto out;
+ goto free_bh;
}
/*
* Bridge control register is at 0x3e, so we'll
@@ -216,7 +216,7 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
"aborting.\n", i);
- goto out;
+ goto free_bh;
}
}
@@ -225,7 +225,7 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
"aborting.\n");
- goto out;
+ goto free_bh;
}
}
@@ -235,7 +235,7 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA PCI-X "
"command register, aborting.\n");
- goto out;
+ goto free_bh;
}
}
@@ -246,7 +246,7 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA PCI Express "
"Device Control register, aborting.\n");
- goto out;
+ goto free_bh;
}
linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];
if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL,
@@ -254,7 +254,7 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA PCI Express "
"Link control register, aborting.\n");
- goto out;
+ goto free_bh;
}
}
@@ -266,7 +266,7 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA reg %x, "
"aborting.\n", i);
- goto out;
+ goto free_bh;
}
}
@@ -275,14 +275,12 @@ good:
err = -ENODEV;
mthca_err(mdev, "Couldn't restore HCA COMMAND, "
"aborting.\n");
- goto out;
}
-
-out:
- if (bridge)
- pci_dev_put(bridge);
+free_bh:
kfree(bridge_header);
+free_hca:
kfree(hca_header);
-
+put_dev:
+ pci_dev_put(bridge);
return err;
}
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index 92914539edc7..2b27d1351cf7 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -356,7 +356,7 @@ static int nes_netdev_stop(struct net_device *netdev)
/**
* nes_nic_send
*/
-static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
+static bool nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
{
struct nes_vnic *nesvnic = netdev_priv(netdev);
struct nes_device *nesdev = nesvnic->nesdev;
@@ -413,7 +413,7 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb));
kfree_skb(skb);
nesvnic->tx_sw_dropped++;
- return NETDEV_TX_LOCKED;
+ return false;
}
set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE,
@@ -454,8 +454,7 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc);
nesnic->sq_head++;
nesnic->sq_head &= nesnic->sq_size - 1;
-
- return NETDEV_TX_OK;
+ return true;
}
@@ -479,7 +478,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
u32 tso_wqe_length;
u32 curr_tcp_seq;
u32 wqe_count=1;
- u32 send_rc;
struct iphdr *iph;
__le16 *wqe_fragment_length;
u32 nr_frags;
@@ -670,13 +668,11 @@ tso_sq_no_longer_full:
skb_linearize(skb);
skb_set_transport_header(skb, hoffset);
skb_set_network_header(skb, nhoffset);
- send_rc = nes_nic_send(skb, netdev);
- if (send_rc != NETDEV_TX_OK)
+ if (!nes_nic_send(skb, netdev))
return NETDEV_TX_OK;
}
} else {
- send_rc = nes_nic_send(skb, netdev);
- if (send_rc != NETDEV_TX_OK)
+ if (!nes_nic_send(skb, netdev))
return NETDEV_TX_OK;
}
@@ -686,7 +682,7 @@ tso_sq_no_longer_full:
nes_write32(nesdev->regs+NES_WQE_ALLOC,
(wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
- netdev->trans_start = jiffies;
+ netif_trans_update(netdev);
return NETDEV_TX_OK;
}
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
index 6d3a169c049b..37331e2fdc5f 100644
--- a/drivers/infiniband/hw/nes/nes_utils.c
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -44,6 +44,7 @@
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <asm/io.h>
#include <asm/irq.h>
@@ -903,70 +904,15 @@ void nes_clc(unsigned long parm)
*/
void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
{
- char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
- 'a', 'b', 'c', 'd', 'e', 'f'};
- char *ptr;
- char hex_buf[80];
- char ascii_buf[20];
- int num_char;
- int num_ascii;
- int num_hex;
-
if (!(nes_debug_level & dump_debug_level)) {
return;
}
- ptr = addr;
if (length > 0x100) {
nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
length = 0x100;
}
- nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length);
-
- memset(ascii_buf, 0, 20);
- memset(hex_buf, 0, 80);
-
- num_ascii = 0;
- num_hex = 0;
- for (num_char = 0; num_char < length; num_char++) {
- if (num_ascii == 8) {
- ascii_buf[num_ascii++] = ' ';
- hex_buf[num_hex++] = '-';
- hex_buf[num_hex++] = ' ';
- }
-
- if (*ptr < 0x20 || *ptr > 0x7e)
- ascii_buf[num_ascii++] = '.';
- else
- ascii_buf[num_ascii++] = *ptr;
- hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)];
- hex_buf[num_hex++] = xlate[*ptr & 0x0f];
- hex_buf[num_hex++] = ' ';
- ptr++;
-
- if (num_ascii >= 17) {
- /* output line and reset */
- nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf);
- memset(ascii_buf, 0, 20);
- memset(hex_buf, 0, 80);
- num_ascii = 0;
- num_hex = 0;
- }
- }
+ nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length);
- /* output the rest */
- if (num_ascii) {
- while (num_ascii < 17) {
- if (num_ascii == 8) {
- hex_buf[num_hex++] = ' ';
- hex_buf[num_hex++] = ' ';
- }
- hex_buf[num_hex++] = ' ';
- hex_buf[num_hex++] = ' ';
- hex_buf[num_hex++] = ' ';
- num_ascii++;
- }
-
- nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf);
- }
+ print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true);
}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index fba69a39a7eb..bd69125731c1 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -402,15 +402,14 @@ static int nes_set_page(struct ib_mr *ibmr, u64 addr)
return 0;
}
-static int nes_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents)
+static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset)
{
struct nes_mr *nesmr = to_nesmr(ibmr);
nesmr->npages = 0;
- return ib_sg_to_pages(ibmr, sg, sg_nents, nes_set_page);
+ return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page);
}
/**
@@ -981,7 +980,7 @@ static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
/**
* nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory.
*/
-static inline void nes_free_qp_mem(struct nes_device *nesdev,
+static void nes_free_qp_mem(struct nes_device *nesdev,
struct nes_qp *nesqp, int virt_wqs)
{
unsigned long flags;
@@ -1315,6 +1314,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
return ERR_PTR(-EINVAL);
}
+ init_completion(&nesqp->sq_drained);
+ init_completion(&nesqp->rq_drained);
nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR);
init_timer(&nesqp->terminate_timer);
@@ -2605,23 +2606,6 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
/**
- * show_fw_ver
- */
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct nes_ib_device *nesibdev =
- container_of(dev, struct nes_ib_device, ibdev.dev);
- struct nes_vnic *nesvnic = nesibdev->nesvnic;
-
- nes_debug(NES_DBG_INIT, "\n");
- return sprintf(buf, "%u.%u\n",
- (nesvnic->nesdev->nesadapter->firmware_version >> 16),
- (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
-}
-
-
-/**
* show_hca
*/
static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
@@ -2644,13 +2628,11 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr,
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
static struct device_attribute *nes_dev_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type,
&dev_attr_board_id
};
@@ -3452,6 +3434,29 @@ out:
return err;
}
+/**
+ * nes_drain_sq - drain sq
+ * @ibqp: pointer to ibqp
+ */
+static void nes_drain_sq(struct ib_qp *ibqp)
+{
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+
+ if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)
+ wait_for_completion(&nesqp->sq_drained);
+}
+
+/**
+ * nes_drain_rq - drain rq
+ * @ibqp: pointer to ibqp
+ */
+static void nes_drain_rq(struct ib_qp *ibqp)
+{
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+
+ if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)
+ wait_for_completion(&nesqp->rq_drained);
+}
/**
* nes_poll_cq
@@ -3582,6 +3587,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
}
}
+ if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) {
+ if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head)
+ complete(&nesqp->sq_drained);
+ if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head)
+ complete(&nesqp->rq_drained);
+ }
+
entry->wr_id = wrid;
entry++;
cqe_count++;
@@ -3672,6 +3684,19 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_dev_fw_str(struct ib_device *dev, char *str,
+ size_t str_len)
+{
+ struct nes_ib_device *nesibdev =
+ container_of(dev, struct nes_ib_device, ibdev);
+ struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+ nes_debug(NES_DBG_INIT, "\n");
+ snprintf(str, str_len, "%u.%u",
+ (nesvnic->nesdev->nesadapter->firmware_version >> 16),
+ (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
+}
+
/**
* nes_init_ofa_device
*/
@@ -3754,6 +3779,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
nesibdev->ibdev.post_send = nes_post_send;
nesibdev->ibdev.post_recv = nes_post_recv;
+ nesibdev->ibdev.drain_sq = nes_drain_sq;
+ nesibdev->ibdev.drain_rq = nes_drain_rq;
nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
if (nesibdev->ibdev.iwcm == NULL) {
@@ -3769,6 +3796,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
nesibdev->ibdev.get_port_immutable = nes_port_immutable;
+ nesibdev->ibdev.get_dev_fw_str = get_dev_fw_str;
memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name,
sizeof(nesibdev->ibdev.iwcm->ifname));
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
index 70290883d067..e02a5662dc20 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.h
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -189,6 +189,8 @@ struct nes_qp {
u8 pau_pending;
u8 pau_state;
__u64 nesuqp_addr;
+ struct completion sq_drained;
+ struct completion rq_drained;
};
struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 16740dcb876b..67fc0b6857e1 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -1156,18 +1156,18 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,
attr->max_srq =
(rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >>
OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET;
- attr->max_send_sge = ((rsp->max_write_send_sge &
+ attr->max_send_sge = ((rsp->max_recv_send_sge &
OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>
OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT);
- attr->max_recv_sge = (rsp->max_write_send_sge &
- OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>
- OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT;
+ attr->max_recv_sge = (rsp->max_recv_send_sge &
+ OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_SHIFT;
attr->max_srq_sge = (rsp->max_srq_rqe_sge &
OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >>
OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET;
- attr->max_rdma_sge = (rsp->max_write_send_sge &
- OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >>
- OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT;
+ attr->max_rdma_sge = (rsp->max_wr_rd_sge &
+ OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_SHIFT;
attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp &
OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >>
OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 3d75f65ce87e..07d0c6c5b046 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -107,6 +107,14 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void get_dev_fw_str(struct ib_device *device, char *str,
+ size_t str_len)
+{
+ struct ocrdma_dev *dev = get_ocrdma_dev(device);
+
+ snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]);
+}
+
static int ocrdma_register_device(struct ocrdma_dev *dev)
{
strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX);
@@ -193,6 +201,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
dev->ibdev.process_mad = ocrdma_process_mad;
dev->ibdev.get_port_immutable = ocrdma_port_immutable;
+ dev->ibdev.get_dev_fw_str = get_dev_fw_str;
if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
dev->ibdev.uverbs_cmd_mask |=
@@ -262,14 +271,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr,
return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
}
-static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
- char *buf)
-{
- struct ocrdma_dev *dev = dev_get_drvdata(device);
-
- return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]);
-}
-
static ssize_t show_hca_type(struct device *device,
struct device_attribute *attr, char *buf)
{
@@ -279,12 +280,10 @@ static ssize_t show_hca_type(struct device *device,
}
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
static struct device_attribute *ocrdma_attributes[] = {
&dev_attr_hw_rev,
- &dev_attr_fw_ver,
&dev_attr_hca_type
};
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
index 0efc9662c6d8..37df4481bb8f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -554,9 +554,9 @@ enum {
OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK = 0x18,
OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0,
OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF,
- OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT = 16,
- OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK = 0xFFFF <<
- OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT,
+ OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_SHIFT = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_RECV_SGE_SHIFT,
OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT = 0,
OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK = 0xFFFF,
@@ -612,6 +612,8 @@ enum {
OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET = 0,
OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK = 0xFFFF <<
OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_SHIFT = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_RD_SGE_MASK = 0xFFFF,
};
struct ocrdma_mbx_query_config {
@@ -619,7 +621,7 @@ struct ocrdma_mbx_query_config {
struct ocrdma_mbx_rsp rsp;
u32 qp_srq_cq_ird_ord;
u32 max_pd_ca_ack_delay;
- u32 max_write_send_sge;
+ u32 max_recv_send_sge;
u32 max_ird_ord_per_qp;
u32 max_shared_ird_ord;
u32 max_mr;
@@ -639,6 +641,8 @@ struct ocrdma_mbx_query_config {
u32 max_wqes_rqes_per_q;
u32 max_cq_cqes_per_cq;
u32 max_srq_rqe_sge;
+ u32 max_wr_rd_sge;
+ u32 ird_pgsz_num_pages;
};
struct ocrdma_fw_ver_rsp {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index a8496a18e20d..0aa854737e74 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -125,8 +125,8 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
IB_DEVICE_SYS_IMAGE_GUID |
IB_DEVICE_LOCAL_DMA_LKEY |
IB_DEVICE_MEM_MGT_EXTENSIONS;
- attr->max_sge = dev->attr.max_send_sge;
- attr->max_sge_rd = attr->max_sge;
+ attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_recv_sge);
+ attr->max_sge_rd = dev->attr.max_rdma_sge;
attr->max_cq = dev->attr.max_cq;
attr->max_cqe = dev->attr.max_cqe;
attr->max_mr = dev->attr.max_mr;
@@ -3081,13 +3081,12 @@ static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr)
return 0;
}
-int ocrdma_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents)
+int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
{
struct ocrdma_mr *mr = get_ocrdma_mr(ibmr);
mr->npages = 0;
- return ib_sg_to_pages(ibmr, sg, sg_nents, ocrdma_set_page);
+ return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ocrdma_set_page);
}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 8b517fd36779..704ef1e9271b 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -122,8 +122,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type,
u32 max_num_sg);
-int ocrdma_map_mr_sg(struct ib_mr *ibmr,
- struct scatterlist *sg,
- int sg_nents);
+int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
#endif /* __OCRDMA_VERBS_H__ */
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c
index 5e75b43c596b..5bad8e3b40bb 100644
--- a/drivers/infiniband/hw/qib/qib_debugfs.c
+++ b/drivers/infiniband/hw/qib/qib_debugfs.c
@@ -189,27 +189,32 @@ static int _ctx_stats_seq_show(struct seq_file *s, void *v)
DEBUGFS_FILE(ctx_stats)
static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(RCU)
{
struct qib_qp_iter *iter;
loff_t n = *pos;
- rcu_read_lock();
iter = qib_qp_iter_init(s->private);
+
+ /* stop calls rcu_read_unlock */
+ rcu_read_lock();
+
if (!iter)
return NULL;
- while (n--) {
+ do {
if (qib_qp_iter_next(iter)) {
kfree(iter);
return NULL;
}
- }
+ } while (n--);
return iter;
}
static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
loff_t *pos)
+ __must_hold(RCU)
{
struct qib_qp_iter *iter = iter_ptr;
@@ -224,6 +229,7 @@ static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
}
static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+ __releases(RCU)
{
rcu_read_unlock();
}
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 24f4a782e0f4..382466a90da7 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -824,10 +824,7 @@ static int mmap_piobufs(struct vm_area_struct *vma,
phys = dd->physaddr + piobufs;
#if defined(__powerpc__)
- /* There isn't a generic way to specify writethrough mappings */
- pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
- pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
- pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
#endif
/*
@@ -2181,6 +2178,11 @@ static ssize_t qib_write(struct file *fp, const char __user *data,
switch (cmd.type) {
case QIB_CMD_ASSIGN_CTXT:
+ if (rcd) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
ret = qib_assign_ctxt(fp, &cmd.cmd.user_info);
if (ret)
goto bail;
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index fcdf37913a26..c3edc033f7c4 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -328,26 +328,12 @@ static ssize_t flash_write(struct file *file, const char __user *buf,
pos = *ppos;
- if (pos != 0) {
- ret = -EINVAL;
- goto bail;
- }
-
- if (count != sizeof(struct qib_flash)) {
- ret = -EINVAL;
- goto bail;
- }
-
- tmp = kmalloc(count, GFP_KERNEL);
- if (!tmp) {
- ret = -ENOMEM;
- goto bail;
- }
+ if (pos != 0 || count != sizeof(struct qib_flash))
+ return -EINVAL;
- if (copy_from_user(tmp, buf, count)) {
- ret = -EFAULT;
- goto bail_tmp;
- }
+ tmp = memdup_user(buf, count);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
dd = private2dd(file);
if (qib_eeprom_write(dd, pos, tmp, count)) {
@@ -361,8 +347,6 @@ static ssize_t flash_write(struct file *file, const char __user *buf,
bail_tmp:
kfree(tmp);
-
-bail:
return ret;
}
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index 82d7c4bf5970..ce4034071f9c 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -1308,21 +1308,6 @@ static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = {
SYM_LSB(IntMask, fldname##17IntMask)), \
.msg = #fldname "_C", .sz = sizeof(#fldname "_C") }
-static const struct qib_hwerror_msgs qib_7322_intr_msgs[] = {
- INTR_AUTO_P(SDmaInt),
- INTR_AUTO_P(SDmaProgressInt),
- INTR_AUTO_P(SDmaIdleInt),
- INTR_AUTO_P(SDmaCleanupDone),
- INTR_AUTO_C(RcvUrg),
- INTR_AUTO_P(ErrInt),
- INTR_AUTO(ErrInt), /* non-port-specific errs */
- INTR_AUTO(AssertGPIOInt),
- INTR_AUTO_P(SendDoneInt),
- INTR_AUTO(SendBufAvailInt),
- INTR_AUTO_C(RcvAvail),
- { .mask = 0, .sz = 0 }
-};
-
#define TXSYMPTOM_AUTO_P(fldname) \
{ .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \
.msg = #fldname, .sz = sizeof(#fldname) }
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index 3f062f0dd9d8..f253111e682e 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -1090,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd)
qib_dbg_ibdev_exit(&dd->verbs_dev);
#endif
free_percpu(dd->int_counter);
- ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+ rvt_dealloc_device(&dd->verbs_dev.rdi);
}
u64 qib_int_counter(struct qib_devdata *dd)
@@ -1183,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
bail:
if (!list_empty(&dd->list))
list_del_init(&dd->list);
- ib_dealloc_device(&dd->verbs_dev.rdi.ibdev);
+ rvt_dealloc_device(&dd->verbs_dev.rdi);
return ERR_PTR(ret);
}
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index 0bd18375d7df..d2ac29861af5 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -1172,11 +1172,13 @@ static int pma_get_classportinfo(struct ib_pma_mad *pmp,
* Set the most significant bit of CM2 to indicate support for
* congestion statistics
*/
- p->reserved[0] = dd->psxmitwait_supported << 7;
+ ib_set_cpi_capmask2(p,
+ dd->psxmitwait_supported <<
+ (31 - IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE));
/*
* Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
*/
- p->resp_time_value = 18;
+ ib_set_cpi_resp_time(p, 18);
return reply((struct ib_smp *) pmp);
}
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
index 4758a3801ae8..6abe1c621aa4 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -144,13 +144,7 @@ int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev,
addr = pci_resource_start(pdev, 0);
len = pci_resource_len(pdev, 0);
-#if defined(__powerpc__)
- /* There isn't a generic way to specify writethrough mappings */
- dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU);
-#else
dd->kregbase = ioremap_nocache(addr, len);
-#endif
-
if (!dd->kregbase)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
index 575b737d9ef3..f9b8cd2354d1 100644
--- a/drivers/infiniband/hw/qib/qib_qp.c
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -106,6 +106,49 @@ static u32 credit_table[31] = {
32768 /* 1E */
};
+const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = {
+[IB_WR_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_RDMA_READ] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_ATOMIC,
+},
+
+[IB_WR_ATOMIC_CMP_AND_SWP] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_ATOMIC_FETCH_AND_ADD] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_RDMA_WRITE_WITH_IMM] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND] = {
+ .length = sizeof(struct ib_send_wr),
+ .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+ BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND_WITH_IMM] = {
+ .length = sizeof(struct ib_send_wr),
+ .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+ BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+};
+
static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map,
gfp_t gfp)
{
@@ -530,10 +573,6 @@ struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev)
return NULL;
iter->dev = dev;
- if (qib_qp_iter_next(iter)) {
- kfree(iter);
- return NULL;
- }
return iter;
}
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 9088e26d3ac8..444028a3582a 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -230,7 +230,7 @@ bail:
*
* Return 1 if constructed; otherwise, return 0.
*/
-int qib_make_rc_req(struct rvt_qp *qp)
+int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags)
{
struct qib_qp_priv *priv = qp->priv;
struct qib_ibdev *dev = to_idev(qp->ibqp.device);
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
index a5f07a64b228..b67779256297 100644
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -739,7 +739,7 @@ void qib_do_send(struct rvt_qp *qp)
struct qib_qp_priv *priv = qp->priv;
struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
struct qib_pportdata *ppd = ppd_from_ibp(ibp);
- int (*make_req)(struct rvt_qp *qp);
+ int (*make_req)(struct rvt_qp *qp, unsigned long *flags);
unsigned long flags;
if ((qp->ibqp.qp_type == IB_QPT_RC ||
@@ -781,7 +781,7 @@ void qib_do_send(struct rvt_qp *qp)
qp->s_hdrwords = 0;
spin_lock_irqsave(&qp->s_lock, flags);
}
- } while (make_req(qp));
+ } while (make_req(qp, &flags));
spin_unlock_irqrestore(&qp->s_lock, flags);
}
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
index 7bdbc79ceaa3..1d61bd04f449 100644
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -45,7 +45,7 @@
*
* Return 1 if constructed; otherwise, return 0.
*/
-int qib_make_uc_req(struct rvt_qp *qp)
+int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags)
{
struct qib_qp_priv *priv = qp->priv;
struct qib_other_headers *ohdr;
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index d9502137de62..10d062561bd9 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -169,8 +169,12 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
}
if (ah_attr->ah_flags & IB_AH_GRH) {
- qib_copy_sge(&qp->r_sge, &ah_attr->grh,
- sizeof(struct ib_grh), 1);
+ struct ib_grh grh;
+ struct ib_global_route grd = ah_attr->grh;
+
+ qib_make_grh(ibp, &grh, &grd, 0, 0);
+ qib_copy_sge(&qp->r_sge, &grh,
+ sizeof(grh), 1);
wc.wc_flags |= IB_WC_GRH;
} else
qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
@@ -238,7 +242,7 @@ drop:
*
* Return 1 if constructed; otherwise, return 0.
*/
-int qib_make_ud_req(struct rvt_qp *qp)
+int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
{
struct qib_qp_priv *priv = qp->priv;
struct qib_other_headers *ohdr;
@@ -294,7 +298,7 @@ int qib_make_ud_req(struct rvt_qp *qp)
this_cpu_inc(ibp->pmastats->n_unicast_xmit);
lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
if (unlikely(lid == ppd->lid)) {
- unsigned long flags;
+ unsigned long tflags = *flags;
/*
* If DMAs are in progress, we can't generate
* a completion for the loopback packet since
@@ -307,10 +311,10 @@ int qib_make_ud_req(struct rvt_qp *qp)
goto bail;
}
qp->s_cur = next_cur;
- local_irq_save(flags);
- spin_unlock_irqrestore(&qp->s_lock, flags);
+ spin_unlock_irqrestore(&qp->s_lock, tflags);
qib_ud_loopback(qp, wqe);
- spin_lock_irqsave(&qp->s_lock, flags);
+ spin_lock_irqsave(&qp->s_lock, tflags);
+ *flags = tflags;
qib_send_complete(qp, wqe, IB_WC_SUCCESS);
goto done;
}
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index cbf6200e6afc..fd1dfbce5539 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -1582,6 +1582,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
rdi->dparms.props.max_total_mcast_qp_attach =
rdi->dparms.props.max_mcast_qp_attach *
rdi->dparms.props.max_mcast_grp;
+ /* post send table */
+ dd->verbs_dev.rdi.post_parms = qib_post_parms;
}
/**
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 4b76a8d59337..736ced684842 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -159,6 +159,7 @@ struct qib_other_headers {
} at;
__be32 imm_data;
__be32 aeth;
+ __be32 ieth;
struct ib_atomic_eth atomic_eth;
} u;
} __packed;
@@ -430,11 +431,11 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
void qib_send_rc_ack(struct rvt_qp *qp);
-int qib_make_rc_req(struct rvt_qp *qp);
+int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags);
-int qib_make_uc_req(struct rvt_qp *qp);
+int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags);
-int qib_make_ud_req(struct rvt_qp *qp);
+int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags);
int qib_register_ib_device(struct qib_devdata *);
@@ -496,4 +497,6 @@ extern unsigned int ib_qib_max_srq_wrs;
extern const u32 ib_qib_rnr_table[];
+extern const struct rvt_operation_params qib_post_parms[];
+
#endif /* QIB_VERBS_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index 565c881a44ba..0a89a955550b 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -331,6 +331,21 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num,
return 0;
}
+static void usnic_get_dev_fw_str(struct ib_device *device,
+ char *str,
+ size_t str_len)
+{
+ struct usnic_ib_dev *us_ibdev =
+ container_of(device, struct usnic_ib_dev, ib_dev);
+ struct ethtool_drvinfo info;
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ snprintf(str, str_len, "%s", info.fw_version);
+}
+
/* Start of PF discovery section */
static void *usnic_ib_device_add(struct pci_dev *dev)
{
@@ -414,6 +429,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev)
us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
us_ibdev->ib_dev.get_port_immutable = usnic_port_immutable;
+ us_ibdev->ib_dev.get_dev_fw_str = usnic_get_dev_fw_str;
if (ib_register_device(&us_ibdev->ib_dev, NULL))
@@ -648,7 +664,8 @@ static int __init usnic_ib_init(void)
return err;
}
- if (pci_register_driver(&usnic_ib_pci_driver)) {
+ err = pci_register_driver(&usnic_ib_pci_driver);
+ if (err) {
usnic_err("Unable to register with PCI\n");
goto out_umem_fini;
}
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
index 3412ea06116e..80ef3f8998c8 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -45,21 +45,6 @@
#include "usnic_ib_verbs.h"
#include "usnic_log.h"
-static ssize_t usnic_ib_show_fw_ver(struct device *device,
- struct device_attribute *attr,
- char *buf)
-{
- struct usnic_ib_dev *us_ibdev =
- container_of(device, struct usnic_ib_dev, ib_dev.dev);
- struct ethtool_drvinfo info;
-
- mutex_lock(&us_ibdev->usdev_lock);
- us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
- mutex_unlock(&us_ibdev->usdev_lock);
-
- return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version);
-}
-
static ssize_t usnic_ib_show_board(struct device *device,
struct device_attribute *attr,
char *buf)
@@ -192,7 +177,6 @@ usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr,
us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
}
-static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL);
static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL);
static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL);
@@ -201,7 +185,6 @@ static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL);
static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL);
static struct device_attribute *usnic_class_attributes[] = {
- &dev_attr_fw_ver,
&dev_attr_board_id,
&dev_attr_config,
&dev_attr_iface,
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 7209fbc03ccb..a0b6ebee4d8a 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -36,7 +36,6 @@
#include <linux/dma-mapping.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
-#include <linux/dma-attrs.h>
#include <linux/iommu.h>
#include <linux/workqueue.h>
#include <linux/list.h>
@@ -112,10 +111,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
int i;
int flags;
dma_addr_t pa;
- DEFINE_DMA_ATTRS(attrs);
-
- if (dmasync)
- dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
if (!can_do_mlock())
return -EPERM;
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile
index 988b6a0101a4..8b095b27db87 100644
--- a/drivers/infiniband/sw/Makefile
+++ b/drivers/infiniband/sw/Makefile
@@ -1 +1,2 @@
obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/
+obj-$(CONFIG_RDMA_RXE) += rxe/
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig
index 11aa6a34bd71..1da8d01a6855 100644
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -1,6 +1,5 @@
config INFINIBAND_RDMAVT
tristate "RDMA verbs transport library"
depends on 64BIT
- default m
---help---
This is a common software verbs provider for RDMA networks.
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index b1ffc8b4a6c0..f2f229efbe64 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -510,6 +510,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi)
if (rdi->worker)
return 0;
+ spin_lock_init(&rdi->n_cqs_lock);
rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL);
if (!rdi->worker)
return -ENOMEM;
@@ -525,6 +526,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi)
return PTR_ERR(task);
}
+ set_user_nice(task, MIN_NICE);
cpu = cpumask_first(cpumask_of_node(rdi->dparms.node));
kthread_bind(task, cpu);
wake_up_process(task);
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 0ff765bfd619..46b64970058e 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -124,11 +124,13 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
int count)
{
int m, i = 0;
+ struct rvt_dev_info *dev = ib_to_rvt(pd->device);
mr->mapsz = 0;
m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
for (; i < m; i++) {
- mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+ mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
+ dev->dparms.node);
if (!mr->map[i]) {
rvt_deinit_mregion(mr);
return -ENOMEM;
@@ -138,6 +140,7 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
init_completion(&mr->comp);
/* count returning the ptr to user */
atomic_set(&mr->refcount, 1);
+ atomic_set(&mr->lkey_invalid, 0);
mr->pd = pd;
mr->max_segs = count;
return 0;
@@ -291,7 +294,7 @@ static void __rvt_free_mr(struct rvt_mr *mr)
{
rvt_deinit_mregion(&mr->mr);
rvt_free_lkey(&mr->mr);
- vfree(mr);
+ kfree(mr);
}
/**
@@ -478,6 +481,123 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
}
/**
+ * rvt_set_page - page assignment function called by ib_sg_to_pages
+ * @ibmr: memory region
+ * @addr: dma address of mapped page
+ *
+ * Return: 0 on success
+ */
+static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+ u32 ps = 1 << mr->mr.page_shift;
+ u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
+ int m, n;
+
+ if (unlikely(mapped_segs == mr->mr.max_segs))
+ return -ENOMEM;
+
+ if (mr->mr.length == 0) {
+ mr->mr.user_base = addr;
+ mr->mr.iova = addr;
+ }
+
+ m = mapped_segs / RVT_SEGSZ;
+ n = mapped_segs % RVT_SEGSZ;
+ mr->mr.map[m]->segs[n].vaddr = (void *)addr;
+ mr->mr.map[m]->segs[n].length = ps;
+ mr->mr.length += ps;
+
+ return 0;
+}
+
+/**
+ * rvt_map_mr_sg - map sg list and set it the memory region
+ * @ibmr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ *
+ * Return: number of sg elements mapped to the memory region
+ */
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+
+ mr->mr.length = 0;
+ mr->mr.page_shift = PAGE_SHIFT;
+ return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
+ rvt_set_page);
+}
+
+/**
+ * rvt_fast_reg_mr - fast register physical MR
+ * @qp: the queue pair where the work request comes from
+ * @ibmr: the memory region to be registered
+ * @key: updated key for this memory region
+ * @access: access flags for this memory region
+ *
+ * Returns 0 on success.
+ */
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+ int access)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+
+ if (qp->ibqp.pd != mr->mr.pd)
+ return -EACCES;
+
+ /* not applicable to dma MR or user MR */
+ if (!mr->mr.lkey || mr->umem)
+ return -EINVAL;
+
+ if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
+ return -EINVAL;
+
+ ibmr->lkey = key;
+ ibmr->rkey = key;
+ mr->mr.lkey = key;
+ mr->mr.access_flags = access;
+ atomic_set(&mr->mr.lkey_invalid, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL(rvt_fast_reg_mr);
+
+/**
+ * rvt_invalidate_rkey - invalidate an MR rkey
+ * @qp: queue pair associated with the invalidate op
+ * @rkey: rkey to invalidate
+ *
+ * Returns 0 on success.
+ */
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
+{
+ struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
+ struct rvt_lkey_table *rkt = &dev->lkey_table;
+ struct rvt_mregion *mr;
+
+ if (rkey == 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ mr = rcu_dereference(
+ rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
+ if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+ goto bail;
+
+ atomic_set(&mr->lkey_invalid, 1);
+ rcu_read_unlock();
+ return 0;
+
+bail:
+ rcu_read_unlock();
+ return -EINVAL;
+}
+EXPORT_SYMBOL(rvt_invalidate_rkey);
+
+/**
* rvt_alloc_fmr - allocate a fast memory region
* @pd: the protection domain for this memory region
* @mr_access_flags: access flags for this memory region
@@ -680,7 +800,8 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
}
mr = rcu_dereference(
rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]);
- if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+ if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
+ mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
goto bail;
off = sge->addr - mr->user_base;
@@ -780,7 +901,8 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
mr = rcu_dereference(
rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
- if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+ if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
+ mr->lkey != rkey || qp->ibqp.pd != mr->pd))
goto bail;
off = vaddr - mr->iova;
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h
index 69380512c6d1..132800ee0205 100644
--- a/drivers/infiniband/sw/rdmavt/mr.h
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -82,6 +82,8 @@ int rvt_dereg_mr(struct ib_mr *ibmr);
struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
enum ib_mr_type mr_type,
u32 max_num_sg);
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset);
struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
struct ib_fmr_attr *fmr_attr);
int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index a9e3bcc522c4..870b4f212fbc 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -369,8 +369,8 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
/* wrap to first map page, invert bit 0 */
offset = qpt->incr | ((offset & 1) ^ 1);
}
- /* there can be no bits at shift and below */
- WARN_ON(offset & (rdi->dparms.qos_shift - 1));
+ /* there can be no set bits in low-order QoS bits */
+ WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1));
qpn = mk_qpn(qpt, map, offset);
}
@@ -397,6 +397,7 @@ static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
{
unsigned n;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
rvt_put_ss(&qp->s_rdma_read_sge);
@@ -431,11 +432,10 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
if (qp->ibqp.qp_type != IB_QPT_RC)
return;
- for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+ for (n = 0; n < rvt_max_atomic(rdi); n++) {
struct rvt_ack_entry *e = &qp->s_ack_queue[n];
- if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
- e->rdma_sge.mr) {
+ if (e->rdma_sge.mr) {
rvt_put_mr(e->rdma_sge.mr);
e->rdma_sge.mr = NULL;
}
@@ -501,6 +501,12 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
*/
static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
enum ib_qp_type type)
+ __releases(&qp->s_lock)
+ __releases(&qp->s_hlock)
+ __releases(&qp->r_lock)
+ __acquires(&qp->r_lock)
+ __acquires(&qp->s_hlock)
+ __acquires(&qp->s_lock)
{
if (qp->state != IB_QPS_RESET) {
qp->state = IB_QPS_RESET;
@@ -569,7 +575,6 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qp->s_ssn = 1;
qp->s_lsn = 0;
qp->s_mig_state = IB_MIG_MIGRATED;
- memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
qp->r_head_ack_queue = 0;
qp->s_tail_ack_queue = 0;
qp->s_num_rd_atomic = 0;
@@ -578,6 +583,7 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qp->r_rq.wq->tail = 0;
}
qp->r_sge.num_sge = 0;
+ atomic_set(&qp->s_reserved_used, 0);
}
/**
@@ -607,6 +613,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
void *priv = NULL;
gfp_t gfp;
+ size_t sqsize;
if (!rdi)
return ERR_PTR(-EINVAL);
@@ -637,7 +644,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
init_attr->cap.max_recv_wr == 0)
return ERR_PTR(-EINVAL);
}
-
+ sqsize =
+ init_attr->cap.max_send_wr + 1 +
+ rdi->dparms.reserved_operations;
switch (init_attr->qp_type) {
case IB_QPT_SMI:
case IB_QPT_GSI:
@@ -652,11 +661,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
sizeof(struct rvt_swqe);
if (gfp == GFP_NOIO)
swq = __vmalloc(
- (init_attr->cap.max_send_wr + 1) * sz,
- gfp, PAGE_KERNEL);
+ sqsize * sz,
+ gfp | __GFP_ZERO, PAGE_KERNEL);
else
- swq = vmalloc_node(
- (init_attr->cap.max_send_wr + 1) * sz,
+ swq = vzalloc_node(
+ sqsize * sz,
rdi->dparms.node);
if (!swq)
return ERR_PTR(-ENOMEM);
@@ -677,14 +686,26 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
goto bail_swq;
RCU_INIT_POINTER(qp->next, NULL);
+ if (init_attr->qp_type == IB_QPT_RC) {
+ qp->s_ack_queue =
+ kzalloc_node(
+ sizeof(*qp->s_ack_queue) *
+ rvt_max_atomic(rdi),
+ gfp,
+ rdi->dparms.node);
+ if (!qp->s_ack_queue)
+ goto bail_qp;
+ }
/*
* Driver needs to set up it's private QP structure and do any
* initialization that is needed.
*/
priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp);
- if (!priv)
+ if (IS_ERR(priv)) {
+ ret = priv;
goto bail_qp;
+ }
qp->priv = priv;
qp->timeout_jiffies =
usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
@@ -704,9 +725,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
qp->r_rq.wq = __vmalloc(
sizeof(struct rvt_rwq) +
qp->r_rq.size * sz,
- gfp, PAGE_KERNEL);
+ gfp | __GFP_ZERO, PAGE_KERNEL);
else
- qp->r_rq.wq = vmalloc_node(
+ qp->r_rq.wq = vzalloc_node(
sizeof(struct rvt_rwq) +
qp->r_rq.size * sz,
rdi->dparms.node);
@@ -723,13 +744,14 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
spin_lock_init(&qp->s_lock);
spin_lock_init(&qp->r_rq.lock);
atomic_set(&qp->refcount, 0);
+ atomic_set(&qp->local_ops_pending, 0);
init_waitqueue_head(&qp->wait);
init_timer(&qp->s_timer);
qp->s_timer.data = (unsigned long)qp;
INIT_LIST_HEAD(&qp->rspwait);
qp->state = IB_QPS_RESET;
qp->s_wq = swq;
- qp->s_size = init_attr->cap.max_send_wr + 1;
+ qp->s_size = sqsize;
qp->s_avail = init_attr->cap.max_send_wr;
qp->s_max_sge = init_attr->cap.max_send_sge;
if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
@@ -829,13 +851,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
case IB_QPT_SMI:
case IB_QPT_GSI:
case IB_QPT_UD:
- qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK;
+ qp->allowed_ops = IB_OPCODE_UD;
break;
case IB_QPT_RC:
- qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK;
+ qp->allowed_ops = IB_OPCODE_RC;
break;
case IB_QPT_UC:
- qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK;
+ qp->allowed_ops = IB_OPCODE_UC;
break;
default:
ret = ERR_PTR(-EINVAL);
@@ -851,12 +873,14 @@ bail_qpn:
free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
bail_rq_wq:
- vfree(qp->r_rq.wq);
+ if (!qp->ip)
+ vfree(qp->r_rq.wq);
bail_driver_priv:
rdi->driver_f.qp_priv_free(rdi, qp);
bail_qp:
+ kfree(qp->s_ack_queue);
kfree(qp);
bail_swq:
@@ -1284,6 +1308,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
vfree(qp->r_rq.wq);
vfree(qp->s_wq);
rdi->driver_f.qp_priv_free(rdi, qp);
+ kfree(qp->s_ack_queue);
kfree(qp);
return 0;
}
@@ -1312,7 +1337,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
attr->dest_qp_num = qp->remote_qpn;
attr->qp_access_flags = qp->qp_access_flags;
- attr->cap.max_send_wr = qp->s_size - 1;
+ attr->cap.max_send_wr = qp->s_size - 1 -
+ rdi->dparms.reserved_operations;
attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
attr->cap.max_send_sge = qp->s_max_sge;
attr->cap.max_recv_sge = qp->r_rq.max_sge;
@@ -1420,25 +1446,116 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
}
/**
- * qp_get_savail - return number of avail send entries
+ * rvt_qp_valid_operation - validate post send wr request
+ * @qp - the qp
+ * @post-parms - the post send table for the driver
+ * @wr - the work request
+ *
+ * The routine validates the operation based on the
+ * validation table an returns the length of the operation
+ * which can extend beyond the ib_send_bw. Operation
+ * dependent flags key atomic operation validation.
*
+ * There is an exception for UD qps that validates the pd and
+ * overrides the length to include the additional UD specific
+ * length.
+ *
+ * Returns a negative error or the length of the work request
+ * for building the swqe.
+ */
+static inline int rvt_qp_valid_operation(
+ struct rvt_qp *qp,
+ const struct rvt_operation_params *post_parms,
+ struct ib_send_wr *wr)
+{
+ int len;
+
+ if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
+ return -EINVAL;
+ if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
+ return -EINVAL;
+ if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
+ ibpd_to_rvtpd(qp->ibqp.pd)->user)
+ return -EINVAL;
+ if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
+ (wr->num_sge == 0 ||
+ wr->sg_list[0].length < sizeof(u64) ||
+ wr->sg_list[0].addr & (sizeof(u64) - 1)))
+ return -EINVAL;
+ if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
+ !qp->s_max_rd_atomic)
+ return -EINVAL;
+ len = post_parms[wr->opcode].length;
+ /* UD specific */
+ if (qp->ibqp.qp_type != IB_QPT_UC &&
+ qp->ibqp.qp_type != IB_QPT_RC) {
+ if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+ return -EINVAL;
+ len = sizeof(struct ib_ud_wr);
+ }
+ return len;
+}
+
+/**
+ * rvt_qp_is_avail - determine queue capacity
* @qp - the qp
+ * @rdi - the rdmavt device
+ * @reserved_op - is reserved operation
*
* This assumes the s_hlock is held but the s_last
* qp variable is uncontrolled.
+ *
+ * For non reserved operations, the qp->s_avail
+ * may be changed.
+ *
+ * The return value is zero or a -ENOMEM.
*/
-static inline u32 qp_get_savail(struct rvt_qp *qp)
+static inline int rvt_qp_is_avail(
+ struct rvt_qp *qp,
+ struct rvt_dev_info *rdi,
+ bool reserved_op)
{
u32 slast;
- u32 ret;
-
+ u32 avail;
+ u32 reserved_used;
+
+ /* see rvt_qp_wqe_unreserve() */
+ smp_mb__before_atomic();
+ reserved_used = atomic_read(&qp->s_reserved_used);
+ if (unlikely(reserved_op)) {
+ /* see rvt_qp_wqe_unreserve() */
+ smp_mb__before_atomic();
+ if (reserved_used >= rdi->dparms.reserved_operations)
+ return -ENOMEM;
+ return 0;
+ }
+ /* non-reserved operations */
+ if (likely(qp->s_avail))
+ return 0;
smp_read_barrier_depends(); /* see rc.c */
slast = ACCESS_ONCE(qp->s_last);
if (qp->s_head >= slast)
- ret = qp->s_size - (qp->s_head - slast);
+ avail = qp->s_size - (qp->s_head - slast);
else
- ret = slast - qp->s_head;
- return ret - 1;
+ avail = slast - qp->s_head;
+
+ /* see rvt_qp_wqe_unreserve() */
+ smp_mb__before_atomic();
+ reserved_used = atomic_read(&qp->s_reserved_used);
+ avail = avail - 1 -
+ (rdi->dparms.reserved_operations - reserved_used);
+ /* insure we don't assign a negative s_avail */
+ if ((s32)avail <= 0)
+ return -ENOMEM;
+ qp->s_avail = avail;
+ if (WARN_ON(qp->s_avail >
+ (qp->s_size - 1 - rdi->dparms.reserved_operations)))
+ rvt_pr_err(rdi,
+ "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
+ qp->ibqp.qp_num, qp->s_size, qp->s_avail,
+ qp->s_head, qp->s_tail, qp->s_cur,
+ qp->s_acked, qp->s_last);
+ return 0;
}
/**
@@ -1460,49 +1577,64 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
u8 log_pmtu;
int ret;
+ size_t cplen;
+ bool reserved_op;
+ int local_ops_delayed = 0;
+
+ BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
/* IB spec says that num_sge == 0 is OK. */
if (unlikely(wr->num_sge > qp->s_max_sge))
return -EINVAL;
+ ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
+ if (ret < 0)
+ return ret;
+ cplen = ret;
+
/*
- * Don't allow RDMA reads or atomic operations on UC or
- * undefined operations.
- * Make sure buffer is large enough to hold the result for atomics.
+ * Local operations include fast register and local invalidate.
+ * Fast register needs to be processed immediately because the
+ * registered lkey may be used by following work requests and the
+ * lkey needs to be valid at the time those requests are posted.
+ * Local invalidate can be processed immediately if fencing is
+ * not required and no previous local invalidate ops are pending.
+ * Signaled local operations that have been processed immediately
+ * need to have requests with "completion only" flags set posted
+ * to the send queue in order to generate completions.
*/
- if (qp->ibqp.qp_type == IB_QPT_UC) {
- if ((unsigned)wr->opcode >= IB_WR_RDMA_READ)
- return -EINVAL;
- } else if (qp->ibqp.qp_type != IB_QPT_RC) {
- /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
- if (wr->opcode != IB_WR_SEND &&
- wr->opcode != IB_WR_SEND_WITH_IMM)
- return -EINVAL;
- /* Check UD destination address PD */
- if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+ if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
+ switch (wr->opcode) {
+ case IB_WR_REG_MR:
+ ret = rvt_fast_reg_mr(qp,
+ reg_wr(wr)->mr,
+ reg_wr(wr)->key,
+ reg_wr(wr)->access);
+ if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+ return ret;
+ break;
+ case IB_WR_LOCAL_INV:
+ if ((wr->send_flags & IB_SEND_FENCE) ||
+ atomic_read(&qp->local_ops_pending)) {
+ local_ops_delayed = 1;
+ } else {
+ ret = rvt_invalidate_rkey(
+ qp, wr->ex.invalidate_rkey);
+ if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+ return ret;
+ }
+ break;
+ default:
return -EINVAL;
- } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
- return -EINVAL;
- } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
- (wr->num_sge == 0 ||
- wr->sg_list[0].length < sizeof(u64) ||
- wr->sg_list[0].addr & (sizeof(u64) - 1))) {
- return -EINVAL;
- } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
- return -EINVAL;
+ }
}
+
+ reserved_op = rdi->post_parms[wr->opcode].flags &
+ RVT_OPERATION_USE_RESERVE;
/* check for avail */
- if (unlikely(!qp->s_avail)) {
- qp->s_avail = qp_get_savail(qp);
- if (WARN_ON(qp->s_avail > (qp->s_size - 1)))
- rvt_pr_err(rdi,
- "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
- qp->ibqp.qp_num, qp->s_size, qp->s_avail,
- qp->s_head, qp->s_tail, qp->s_cur,
- qp->s_acked, qp->s_last);
- if (!qp->s_avail)
- return -ENOMEM;
- }
+ ret = rvt_qp_is_avail(qp, rdi, reserved_op);
+ if (ret)
+ return ret;
next = qp->s_head + 1;
if (next >= qp->s_size)
next = 0;
@@ -1511,18 +1643,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
pd = ibpd_to_rvtpd(qp->ibqp.pd);
wqe = rvt_get_swqe_ptr(qp, qp->s_head);
- if (qp->ibqp.qp_type != IB_QPT_UC &&
- qp->ibqp.qp_type != IB_QPT_RC)
- memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
- else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
- wr->opcode == IB_WR_RDMA_WRITE ||
- wr->opcode == IB_WR_RDMA_READ)
- memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
- else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
- wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
- memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
- else
- memcpy(&wqe->wr, wr, sizeof(wqe->wr));
+ /* cplen has length from above */
+ memcpy(&wqe->wr, wr, cplen);
wqe->length = 0;
j = 0;
@@ -1565,14 +1687,29 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
}
- wqe->ssn = qp->s_ssn++;
- wqe->psn = qp->s_next_psn;
- wqe->lpsn = wqe->psn +
- (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0);
- qp->s_next_psn = wqe->lpsn + 1;
+ if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
+ if (local_ops_delayed)
+ atomic_inc(&qp->local_ops_pending);
+ else
+ wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
+ wqe->ssn = 0;
+ wqe->psn = 0;
+ wqe->lpsn = 0;
+ } else {
+ wqe->ssn = qp->s_ssn++;
+ wqe->psn = qp->s_next_psn;
+ wqe->lpsn = wqe->psn +
+ (wqe->length ?
+ ((wqe->length - 1) >> log_pmtu) :
+ 0);
+ qp->s_next_psn = wqe->lpsn + 1;
+ }
trace_rvt_post_one_wr(qp, wqe);
+ if (unlikely(reserved_op))
+ rvt_qp_wqe_reserve(qp, wqe);
+ else
+ qp->s_avail--;
smp_wmb(); /* see request builders */
- qp->s_avail--;
qp->s_head = next;
return 0;
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 6caf5272ba1f..d430c2f7cec4 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -106,6 +106,19 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
}
EXPORT_SYMBOL(rvt_alloc_device);
+/**
+ * rvt_dealloc_device - deallocate rdi
+ * @rdi: structure to free
+ *
+ * Free a structure allocated with rvt_alloc_device()
+ */
+void rvt_dealloc_device(struct rvt_dev_info *rdi)
+{
+ kfree(rdi->ports);
+ ib_dealloc_device(&rdi->ibdev);
+}
+EXPORT_SYMBOL(rvt_dealloc_device);
+
static int rvt_query_device(struct ib_device *ibdev,
struct ib_device_attr *props,
struct ib_udata *uhw)
@@ -357,6 +370,7 @@ enum {
REG_USER_MR,
DEREG_MR,
ALLOC_MR,
+ MAP_MR_SG,
ALLOC_FMR,
MAP_PHYS_FMR,
UNMAP_FMR,
@@ -488,9 +502,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
!rdi->driver_f.quiesce_qp ||
!rdi->driver_f.notify_error_qp ||
!rdi->driver_f.mtu_from_qp ||
- !rdi->driver_f.mtu_to_path_mtu ||
- !rdi->driver_f.shut_down_port ||
- !rdi->driver_f.cap_mask_chg)
+ !rdi->driver_f.mtu_to_path_mtu)
return -EINVAL;
break;
@@ -517,7 +529,8 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
post_send),
rvt_post_send))
if (!rdi->driver_f.schedule_send ||
- !rdi->driver_f.do_send)
+ !rdi->driver_f.do_send ||
+ !rdi->post_parms)
return -EINVAL;
break;
@@ -622,6 +635,12 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
rvt_alloc_mr);
break;
+ case MAP_MR_SG:
+ check_driver_override(rdi, offsetof(struct ib_device,
+ map_mr_sg),
+ rvt_map_mr_sg);
+ break;
+
case MAP_PHYS_FMR:
check_driver_override(rdi, offsetof(struct ib_device,
map_phys_fmr),
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig
new file mode 100644
index 000000000000..1e4e628fe7b0
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -0,0 +1,24 @@
+config RDMA_RXE
+ tristate "Software RDMA over Ethernet (RoCE) driver"
+ depends on INET && PCI && INFINIBAND
+ depends on NET_UDP_TUNNEL
+ ---help---
+ This driver implements the InfiniBand RDMA transport over
+ the Linux network stack. It enables a system with a
+ standard Ethernet adapter to interoperate with a RoCE
+ adapter or with another system running the RXE driver.
+ Documentation on InfiniBand and RoCE can be downloaded at
+ www.infinibandta.org and www.openfabrics.org. (See also
+ siw which is a similar software driver for iWARP.)
+
+ The driver is split into two layers, one interfaces with the
+ Linux RDMA stack and implements a kernel or user space
+ verbs API. The user space verbs API requires a support
+ library named librxe which is loaded by the generic user
+ space verbs API, libibverbs. The other layer interfaces
+ with the Linux network stack at layer 3.
+
+ To configure and work with soft-RoCE driver please use the
+ following wiki page under "configure Soft-RoCE (RXE)" section:
+
+ https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile
new file mode 100644
index 000000000000..3b3fb9d1c470
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/Makefile
@@ -0,0 +1,24 @@
+obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o
+
+rdma_rxe-y := \
+ rxe.o \
+ rxe_comp.o \
+ rxe_req.o \
+ rxe_resp.o \
+ rxe_recv.o \
+ rxe_pool.o \
+ rxe_queue.o \
+ rxe_verbs.o \
+ rxe_av.o \
+ rxe_srq.o \
+ rxe_qp.o \
+ rxe_cq.o \
+ rxe_mr.o \
+ rxe_dma.o \
+ rxe_opcode.o \
+ rxe_mmap.o \
+ rxe_icrc.o \
+ rxe_mcast.o \
+ rxe_task.o \
+ rxe_net.o \
+ rxe_sysfs.o
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
new file mode 100644
index 000000000000..ddd59270ff6d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib");
+MODULE_DESCRIPTION("Soft RDMA transport");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.2");
+
+/* free resources for all ports on a device */
+static void rxe_cleanup_ports(struct rxe_dev *rxe)
+{
+ kfree(rxe->port.pkey_tbl);
+ rxe->port.pkey_tbl = NULL;
+
+}
+
+/* free resources for a rxe device all objects created for this device must
+ * have been destroyed
+ */
+static void rxe_cleanup(struct rxe_dev *rxe)
+{
+ rxe_pool_cleanup(&rxe->uc_pool);
+ rxe_pool_cleanup(&rxe->pd_pool);
+ rxe_pool_cleanup(&rxe->ah_pool);
+ rxe_pool_cleanup(&rxe->srq_pool);
+ rxe_pool_cleanup(&rxe->qp_pool);
+ rxe_pool_cleanup(&rxe->cq_pool);
+ rxe_pool_cleanup(&rxe->mr_pool);
+ rxe_pool_cleanup(&rxe->mw_pool);
+ rxe_pool_cleanup(&rxe->mc_grp_pool);
+ rxe_pool_cleanup(&rxe->mc_elem_pool);
+
+ rxe_cleanup_ports(rxe);
+}
+
+/* called when all references have been dropped */
+void rxe_release(struct kref *kref)
+{
+ struct rxe_dev *rxe = container_of(kref, struct rxe_dev, ref_cnt);
+
+ rxe_cleanup(rxe);
+ ib_dealloc_device(&rxe->ib_dev);
+}
+
+void rxe_dev_put(struct rxe_dev *rxe)
+{
+ kref_put(&rxe->ref_cnt, rxe_release);
+}
+EXPORT_SYMBOL_GPL(rxe_dev_put);
+
+/* initialize rxe device parameters */
+static int rxe_init_device_param(struct rxe_dev *rxe)
+{
+ rxe->max_inline_data = RXE_MAX_INLINE_DATA;
+
+ rxe->attr.fw_ver = RXE_FW_VER;
+ rxe->attr.max_mr_size = RXE_MAX_MR_SIZE;
+ rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP;
+ rxe->attr.vendor_id = RXE_VENDOR_ID;
+ rxe->attr.vendor_part_id = RXE_VENDOR_PART_ID;
+ rxe->attr.hw_ver = RXE_HW_VER;
+ rxe->attr.max_qp = RXE_MAX_QP;
+ rxe->attr.max_qp_wr = RXE_MAX_QP_WR;
+ rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS;
+ rxe->attr.max_sge = RXE_MAX_SGE;
+ rxe->attr.max_sge_rd = RXE_MAX_SGE_RD;
+ rxe->attr.max_cq = RXE_MAX_CQ;
+ rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1;
+ rxe->attr.max_mr = RXE_MAX_MR;
+ rxe->attr.max_pd = RXE_MAX_PD;
+ rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM;
+ rxe->attr.max_ee_rd_atom = RXE_MAX_EE_RD_ATOM;
+ rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM;
+ rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM;
+ rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM;
+ rxe->attr.atomic_cap = RXE_ATOMIC_CAP;
+ rxe->attr.max_ee = RXE_MAX_EE;
+ rxe->attr.max_rdd = RXE_MAX_RDD;
+ rxe->attr.max_mw = RXE_MAX_MW;
+ rxe->attr.max_raw_ipv6_qp = RXE_MAX_RAW_IPV6_QP;
+ rxe->attr.max_raw_ethy_qp = RXE_MAX_RAW_ETHY_QP;
+ rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP;
+ rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH;
+ rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH;
+ rxe->attr.max_ah = RXE_MAX_AH;
+ rxe->attr.max_fmr = RXE_MAX_FMR;
+ rxe->attr.max_map_per_fmr = RXE_MAX_MAP_PER_FMR;
+ rxe->attr.max_srq = RXE_MAX_SRQ;
+ rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR;
+ rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE;
+ rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN;
+ rxe->attr.max_pkeys = RXE_MAX_PKEYS;
+ rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY;
+
+ rxe->max_ucontext = RXE_MAX_UCONTEXT;
+
+ return 0;
+}
+
+/* initialize port attributes */
+static int rxe_init_port_param(struct rxe_port *port)
+{
+ port->attr.state = RXE_PORT_STATE;
+ port->attr.max_mtu = RXE_PORT_MAX_MTU;
+ port->attr.active_mtu = RXE_PORT_ACTIVE_MTU;
+ port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN;
+ port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS;
+ port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ;
+ port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR;
+ port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR;
+ port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN;
+ port->attr.lid = RXE_PORT_LID;
+ port->attr.sm_lid = RXE_PORT_SM_LID;
+ port->attr.lmc = RXE_PORT_LMC;
+ port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM;
+ port->attr.sm_sl = RXE_PORT_SM_SL;
+ port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT;
+ port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY;
+ port->attr.active_width = RXE_PORT_ACTIVE_WIDTH;
+ port->attr.active_speed = RXE_PORT_ACTIVE_SPEED;
+ port->attr.phys_state = RXE_PORT_PHYS_STATE;
+ port->mtu_cap =
+ ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU);
+ port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX);
+
+ return 0;
+}
+
+/* initialize port state, note IB convention that HCA ports are always
+ * numbered from 1
+ */
+static int rxe_init_ports(struct rxe_dev *rxe)
+{
+ struct rxe_port *port = &rxe->port;
+
+ rxe_init_port_param(port);
+
+ if (!port->attr.pkey_tbl_len || !port->attr.gid_tbl_len)
+ return -EINVAL;
+
+ port->pkey_tbl = kcalloc(port->attr.pkey_tbl_len,
+ sizeof(*port->pkey_tbl), GFP_KERNEL);
+
+ if (!port->pkey_tbl)
+ return -ENOMEM;
+
+ port->pkey_tbl[0] = 0xffff;
+ port->port_guid = rxe->ifc_ops->port_guid(rxe);
+
+ spin_lock_init(&port->port_lock);
+
+ return 0;
+}
+
+/* init pools of managed objects */
+static int rxe_init_pools(struct rxe_dev *rxe)
+{
+ int err;
+
+ err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC,
+ rxe->max_ucontext);
+ if (err)
+ goto err1;
+
+ err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD,
+ rxe->attr.max_pd);
+ if (err)
+ goto err2;
+
+ err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH,
+ rxe->attr.max_ah);
+ if (err)
+ goto err3;
+
+ err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ,
+ rxe->attr.max_srq);
+ if (err)
+ goto err4;
+
+ err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP,
+ rxe->attr.max_qp);
+ if (err)
+ goto err5;
+
+ err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ,
+ rxe->attr.max_cq);
+ if (err)
+ goto err6;
+
+ err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR,
+ rxe->attr.max_mr);
+ if (err)
+ goto err7;
+
+ err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW,
+ rxe->attr.max_mw);
+ if (err)
+ goto err8;
+
+ err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP,
+ rxe->attr.max_mcast_grp);
+ if (err)
+ goto err9;
+
+ err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM,
+ rxe->attr.max_total_mcast_qp_attach);
+ if (err)
+ goto err10;
+
+ return 0;
+
+err10:
+ rxe_pool_cleanup(&rxe->mc_grp_pool);
+err9:
+ rxe_pool_cleanup(&rxe->mw_pool);
+err8:
+ rxe_pool_cleanup(&rxe->mr_pool);
+err7:
+ rxe_pool_cleanup(&rxe->cq_pool);
+err6:
+ rxe_pool_cleanup(&rxe->qp_pool);
+err5:
+ rxe_pool_cleanup(&rxe->srq_pool);
+err4:
+ rxe_pool_cleanup(&rxe->ah_pool);
+err3:
+ rxe_pool_cleanup(&rxe->pd_pool);
+err2:
+ rxe_pool_cleanup(&rxe->uc_pool);
+err1:
+ return err;
+}
+
+/* initialize rxe device state */
+static int rxe_init(struct rxe_dev *rxe)
+{
+ int err;
+
+ /* init default device parameters */
+ rxe_init_device_param(rxe);
+
+ err = rxe_init_ports(rxe);
+ if (err)
+ goto err1;
+
+ err = rxe_init_pools(rxe);
+ if (err)
+ goto err2;
+
+ /* init pending mmap list */
+ spin_lock_init(&rxe->mmap_offset_lock);
+ spin_lock_init(&rxe->pending_lock);
+ INIT_LIST_HEAD(&rxe->pending_mmaps);
+ INIT_LIST_HEAD(&rxe->list);
+
+ mutex_init(&rxe->usdev_lock);
+
+ return 0;
+
+err2:
+ rxe_cleanup_ports(rxe);
+err1:
+ return err;
+}
+
+int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
+{
+ struct rxe_port *port = &rxe->port;
+ enum ib_mtu mtu;
+
+ mtu = eth_mtu_int_to_enum(ndev_mtu);
+
+ /* Make sure that new MTU in range */
+ mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256;
+
+ port->attr.active_mtu = mtu;
+ port->mtu_cap = ib_mtu_enum_to_int(mtu);
+
+ return 0;
+}
+EXPORT_SYMBOL(rxe_set_mtu);
+
+/* called by ifc layer to create new rxe device.
+ * The caller should allocate memory for rxe by calling ib_alloc_device.
+ */
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu)
+{
+ int err;
+
+ kref_init(&rxe->ref_cnt);
+
+ err = rxe_init(rxe);
+ if (err)
+ goto err1;
+
+ err = rxe_set_mtu(rxe, mtu);
+ if (err)
+ goto err1;
+
+ err = rxe_register_device(rxe);
+ if (err)
+ goto err1;
+
+ return 0;
+
+err1:
+ rxe_dev_put(rxe);
+ return err;
+}
+EXPORT_SYMBOL(rxe_add);
+
+/* called by the ifc layer to remove a device */
+void rxe_remove(struct rxe_dev *rxe)
+{
+ rxe_unregister_device(rxe);
+
+ rxe_dev_put(rxe);
+}
+EXPORT_SYMBOL(rxe_remove);
+
+static int __init rxe_module_init(void)
+{
+ int err;
+
+ /* initialize slab caches for managed objects */
+ err = rxe_cache_init();
+ if (err) {
+ pr_err("rxe: unable to init object pools\n");
+ return err;
+ }
+
+ err = rxe_net_ipv4_init();
+ if (err) {
+ pr_err("rxe: unable to init ipv4 tunnel\n");
+ rxe_cache_exit();
+ goto exit;
+ }
+
+ err = rxe_net_ipv6_init();
+ if (err) {
+ pr_err("rxe: unable to init ipv6 tunnel\n");
+ rxe_cache_exit();
+ goto exit;
+ }
+
+ err = register_netdevice_notifier(&rxe_net_notifier);
+ if (err) {
+ pr_err("rxe: Failed to rigister netdev notifier\n");
+ goto exit;
+ }
+
+ pr_info("rxe: loaded\n");
+
+ return 0;
+
+exit:
+ rxe_release_udp_tunnel(recv_sockets.sk4);
+ rxe_release_udp_tunnel(recv_sockets.sk6);
+ return err;
+}
+
+static void __exit rxe_module_exit(void)
+{
+ rxe_remove_all();
+ rxe_net_exit();
+ rxe_cache_exit();
+
+ pr_info("rxe: unloaded\n");
+}
+
+module_init(rxe_module_init);
+module_exit(rxe_module_exit);
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
new file mode 100644
index 000000000000..12c71c549f97
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_H
+#define RXE_H
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/crc32.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe_net.h"
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+#include "rxe_param.h"
+#include "rxe_verbs.h"
+
+#define RXE_UVERBS_ABI_VERSION (1)
+
+#define IB_PHYS_STATE_LINK_UP (5)
+#define IB_PHYS_STATE_LINK_DOWN (3)
+
+#define RXE_ROCE_V2_SPORT (0xc000)
+
+int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
+
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu);
+void rxe_remove(struct rxe_dev *rxe);
+void rxe_remove_all(void);
+
+int rxe_rcv(struct sk_buff *skb);
+
+void rxe_dev_put(struct rxe_dev *rxe);
+struct rxe_dev *net_to_rxe(struct net_device *ndev);
+struct rxe_dev *get_rxe_by_name(const char* name);
+
+void rxe_port_up(struct rxe_dev *rxe);
+void rxe_port_down(struct rxe_dev *rxe);
+
+#endif /* RXE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
new file mode 100644
index 000000000000..5c9474212d4e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr)
+{
+ struct rxe_port *port;
+
+ if (attr->port_num != 1) {
+ pr_info("rxe: invalid port_num = %d\n", attr->port_num);
+ return -EINVAL;
+ }
+
+ port = &rxe->port;
+
+ if (attr->ah_flags & IB_AH_GRH) {
+ if (attr->grh.sgid_index > port->attr.gid_tbl_len) {
+ pr_info("rxe: invalid sgid index = %d\n",
+ attr->grh.sgid_index);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+ struct rxe_av *av, struct ib_ah_attr *attr)
+{
+ memset(av, 0, sizeof(*av));
+ memcpy(&av->grh, &attr->grh, sizeof(attr->grh));
+ av->port_num = port_num;
+ return 0;
+}
+
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+ struct ib_ah_attr *attr)
+{
+ memcpy(&attr->grh, &av->grh, sizeof(av->grh));
+ attr->port_num = av->port_num;
+ return 0;
+}
+
+int rxe_av_fill_ip_info(struct rxe_dev *rxe,
+ struct rxe_av *av,
+ struct ib_ah_attr *attr,
+ struct ib_gid_attr *sgid_attr,
+ union ib_gid *sgid)
+{
+ rdma_gid2ip(&av->sgid_addr._sockaddr, sgid);
+ rdma_gid2ip(&av->dgid_addr._sockaddr, &attr->grh.dgid);
+ av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid);
+
+ return 0;
+}
+
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt)
+{
+ if (!pkt || !pkt->qp)
+ return NULL;
+
+ if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC)
+ return &pkt->qp->pri_av;
+
+ return (pkt->wqe) ? &pkt->wqe->av : NULL;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
new file mode 100644
index 000000000000..1c59ef2c67aa
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+enum comp_state {
+ COMPST_GET_ACK,
+ COMPST_GET_WQE,
+ COMPST_COMP_WQE,
+ COMPST_COMP_ACK,
+ COMPST_CHECK_PSN,
+ COMPST_CHECK_ACK,
+ COMPST_READ,
+ COMPST_ATOMIC,
+ COMPST_WRITE_SEND,
+ COMPST_UPDATE_COMP,
+ COMPST_ERROR_RETRY,
+ COMPST_RNR_RETRY,
+ COMPST_ERROR,
+ COMPST_EXIT, /* We have an issue, and we want to rerun the completer */
+ COMPST_DONE, /* The completer finished successflly */
+};
+
+static char *comp_state_name[] = {
+ [COMPST_GET_ACK] = "GET ACK",
+ [COMPST_GET_WQE] = "GET WQE",
+ [COMPST_COMP_WQE] = "COMP WQE",
+ [COMPST_COMP_ACK] = "COMP ACK",
+ [COMPST_CHECK_PSN] = "CHECK PSN",
+ [COMPST_CHECK_ACK] = "CHECK ACK",
+ [COMPST_READ] = "READ",
+ [COMPST_ATOMIC] = "ATOMIC",
+ [COMPST_WRITE_SEND] = "WRITE/SEND",
+ [COMPST_UPDATE_COMP] = "UPDATE COMP",
+ [COMPST_ERROR_RETRY] = "ERROR RETRY",
+ [COMPST_RNR_RETRY] = "RNR RETRY",
+ [COMPST_ERROR] = "ERROR",
+ [COMPST_EXIT] = "EXIT",
+ [COMPST_DONE] = "DONE",
+};
+
+static unsigned long rnrnak_usec[32] = {
+ [IB_RNR_TIMER_655_36] = 655360,
+ [IB_RNR_TIMER_000_01] = 10,
+ [IB_RNR_TIMER_000_02] = 20,
+ [IB_RNR_TIMER_000_03] = 30,
+ [IB_RNR_TIMER_000_04] = 40,
+ [IB_RNR_TIMER_000_06] = 60,
+ [IB_RNR_TIMER_000_08] = 80,
+ [IB_RNR_TIMER_000_12] = 120,
+ [IB_RNR_TIMER_000_16] = 160,
+ [IB_RNR_TIMER_000_24] = 240,
+ [IB_RNR_TIMER_000_32] = 320,
+ [IB_RNR_TIMER_000_48] = 480,
+ [IB_RNR_TIMER_000_64] = 640,
+ [IB_RNR_TIMER_000_96] = 960,
+ [IB_RNR_TIMER_001_28] = 1280,
+ [IB_RNR_TIMER_001_92] = 1920,
+ [IB_RNR_TIMER_002_56] = 2560,
+ [IB_RNR_TIMER_003_84] = 3840,
+ [IB_RNR_TIMER_005_12] = 5120,
+ [IB_RNR_TIMER_007_68] = 7680,
+ [IB_RNR_TIMER_010_24] = 10240,
+ [IB_RNR_TIMER_015_36] = 15360,
+ [IB_RNR_TIMER_020_48] = 20480,
+ [IB_RNR_TIMER_030_72] = 30720,
+ [IB_RNR_TIMER_040_96] = 40960,
+ [IB_RNR_TIMER_061_44] = 61410,
+ [IB_RNR_TIMER_081_92] = 81920,
+ [IB_RNR_TIMER_122_88] = 122880,
+ [IB_RNR_TIMER_163_84] = 163840,
+ [IB_RNR_TIMER_245_76] = 245760,
+ [IB_RNR_TIMER_327_68] = 327680,
+ [IB_RNR_TIMER_491_52] = 491520,
+};
+
+static inline unsigned long rnrnak_jiffies(u8 timeout)
+{
+ return max_t(unsigned long,
+ usecs_to_jiffies(rnrnak_usec[timeout]), 1);
+}
+
+static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+ switch (opcode) {
+ case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE;
+ case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE;
+ case IB_WR_SEND: return IB_WC_SEND;
+ case IB_WR_SEND_WITH_IMM: return IB_WC_SEND;
+ case IB_WR_RDMA_READ: return IB_WC_RDMA_READ;
+ case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP;
+ case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD;
+ case IB_WR_LSO: return IB_WC_LSO;
+ case IB_WR_SEND_WITH_INV: return IB_WC_SEND;
+ case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ;
+ case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV;
+ case IB_WR_REG_MR: return IB_WC_REG_MR;
+
+ default:
+ return 0xff;
+ }
+}
+
+void retransmit_timer(unsigned long data)
+{
+ struct rxe_qp *qp = (struct rxe_qp *)data;
+
+ if (qp->valid) {
+ qp->comp.timeout = 1;
+ rxe_run_task(&qp->comp.task, 1);
+ }
+}
+
+void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct sk_buff *skb)
+{
+ int must_sched;
+
+ skb_queue_tail(&qp->resp_pkts, skb);
+
+ must_sched = skb_queue_len(&qp->resp_pkts) > 1;
+ rxe_run_task(&qp->comp.task, must_sched);
+}
+
+static inline enum comp_state get_wqe(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe **wqe_p)
+{
+ struct rxe_send_wqe *wqe;
+
+ /* we come here whether or not we found a response packet to see if
+ * there are any posted WQEs
+ */
+ wqe = queue_head(qp->sq.queue);
+ *wqe_p = wqe;
+
+ /* no WQE or requester has not started it yet */
+ if (!wqe || wqe->state == wqe_state_posted)
+ return pkt ? COMPST_DONE : COMPST_EXIT;
+
+ /* WQE does not require an ack */
+ if (wqe->state == wqe_state_done)
+ return COMPST_COMP_WQE;
+
+ /* WQE caused an error */
+ if (wqe->state == wqe_state_error)
+ return COMPST_ERROR;
+
+ /* we have a WQE, if we also have an ack check its PSN */
+ return pkt ? COMPST_CHECK_PSN : COMPST_EXIT;
+}
+
+static inline void reset_retry_counters(struct rxe_qp *qp)
+{
+ qp->comp.retry_cnt = qp->attr.retry_cnt;
+ qp->comp.rnr_retry = qp->attr.rnr_retry;
+}
+
+static inline enum comp_state check_psn(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ s32 diff;
+
+ /* check to see if response is past the oldest WQE. if it is, complete
+ * send/write or error read/atomic
+ */
+ diff = psn_compare(pkt->psn, wqe->last_psn);
+ if (diff > 0) {
+ if (wqe->state == wqe_state_pending) {
+ if (wqe->mask & WR_ATOMIC_OR_READ_MASK)
+ return COMPST_ERROR_RETRY;
+
+ reset_retry_counters(qp);
+ return COMPST_COMP_WQE;
+ } else {
+ return COMPST_DONE;
+ }
+ }
+
+ /* compare response packet to expected response */
+ diff = psn_compare(pkt->psn, qp->comp.psn);
+ if (diff < 0) {
+ /* response is most likely a retried packet if it matches an
+ * uncompleted WQE go complete it else ignore it
+ */
+ if (pkt->psn == wqe->last_psn)
+ return COMPST_COMP_ACK;
+ else
+ return COMPST_DONE;
+ } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) {
+ return COMPST_ERROR_RETRY;
+ } else {
+ return COMPST_CHECK_ACK;
+ }
+}
+
+static inline enum comp_state check_ack(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ unsigned int mask = pkt->mask;
+ u8 syn;
+
+ /* Check the sequence only */
+ switch (qp->comp.opcode) {
+ case -1:
+ /* Will catch all *_ONLY cases. */
+ if (!(mask & RXE_START_MASK))
+ return COMPST_ERROR;
+
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
+ pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+ return COMPST_ERROR;
+ }
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ /* Check operation validity. */
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+ syn = aeth_syn(pkt);
+
+ if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+ return COMPST_ERROR;
+
+ /* Fall through (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE
+ * doesn't have an AETH)
+ */
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ if (wqe->wr.opcode != IB_WR_RDMA_READ &&
+ wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) {
+ return COMPST_ERROR;
+ }
+ reset_retry_counters(qp);
+ return COMPST_READ;
+
+ case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE:
+ syn = aeth_syn(pkt);
+
+ if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+ return COMPST_ERROR;
+
+ if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP &&
+ wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD)
+ return COMPST_ERROR;
+ reset_retry_counters(qp);
+ return COMPST_ATOMIC;
+
+ case IB_OPCODE_RC_ACKNOWLEDGE:
+ syn = aeth_syn(pkt);
+ switch (syn & AETH_TYPE_MASK) {
+ case AETH_ACK:
+ reset_retry_counters(qp);
+ return COMPST_WRITE_SEND;
+
+ case AETH_RNR_NAK:
+ return COMPST_RNR_RETRY;
+
+ case AETH_NAK:
+ switch (syn) {
+ case AETH_NAK_PSN_SEQ_ERROR:
+ /* a nak implicitly acks all packets with psns
+ * before
+ */
+ if (psn_compare(pkt->psn, qp->comp.psn) > 0) {
+ qp->comp.psn = pkt->psn;
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ rxe_run_task(&qp->req.task, 1);
+ }
+ }
+ return COMPST_ERROR_RETRY;
+
+ case AETH_NAK_INVALID_REQ:
+ wqe->status = IB_WC_REM_INV_REQ_ERR;
+ return COMPST_ERROR;
+
+ case AETH_NAK_REM_ACC_ERR:
+ wqe->status = IB_WC_REM_ACCESS_ERR;
+ return COMPST_ERROR;
+
+ case AETH_NAK_REM_OP_ERR:
+ wqe->status = IB_WC_REM_OP_ERR;
+ return COMPST_ERROR;
+
+ default:
+ pr_warn("unexpected nak %x\n", syn);
+ wqe->status = IB_WC_REM_OP_ERR;
+ return COMPST_ERROR;
+ }
+
+ default:
+ return COMPST_ERROR;
+ }
+ break;
+
+ default:
+ pr_warn("unexpected opcode\n");
+ }
+
+ return COMPST_ERROR;
+}
+
+static inline enum comp_state do_read(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ int ret;
+
+ ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE,
+ &wqe->dma, payload_addr(pkt),
+ payload_size(pkt), to_mem_obj, NULL);
+ if (ret)
+ return COMPST_ERROR;
+
+ if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK))
+ return COMPST_COMP_ACK;
+ else
+ return COMPST_UPDATE_COMP;
+}
+
+static inline enum comp_state do_atomic(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ int ret;
+
+ u64 atomic_orig = atmack_orig(pkt);
+
+ ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE,
+ &wqe->dma, &atomic_orig,
+ sizeof(u64), to_mem_obj, NULL);
+ if (ret)
+ return COMPST_ERROR;
+ else
+ return COMPST_COMP_ACK;
+}
+
+static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ struct rxe_cqe *cqe)
+{
+ memset(cqe, 0, sizeof(*cqe));
+
+ if (!qp->is_user) {
+ struct ib_wc *wc = &cqe->ibwc;
+
+ wc->wr_id = wqe->wr.wr_id;
+ wc->status = wqe->status;
+ wc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_IMM)
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->byte_len = wqe->dma.length;
+ wc->qp = &qp->ibqp;
+ } else {
+ struct ib_uverbs_wc *uwc = &cqe->uibwc;
+
+ uwc->wr_id = wqe->wr.wr_id;
+ uwc->status = wqe->status;
+ uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_IMM)
+ uwc->wc_flags = IB_WC_WITH_IMM;
+ uwc->byte_len = wqe->dma.length;
+ uwc->qp_num = qp->ibqp.qp_num;
+ }
+}
+
+static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ struct rxe_cqe cqe;
+
+ if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ (qp->req.state == QP_STATE_ERROR)) {
+ make_send_cqe(qp, wqe, &cqe);
+ rxe_cq_post(qp->scq, &cqe, 0);
+ }
+
+ advance_consumer(qp->sq.queue);
+
+ /*
+ * we completed something so let req run again
+ * if it is trying to fence
+ */
+ if (qp->req.wait_fence) {
+ qp->req.wait_fence = 0;
+ rxe_run_task(&qp->req.task, 1);
+ }
+}
+
+static inline enum comp_state complete_ack(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ unsigned long flags;
+
+ if (wqe->has_rd_atomic) {
+ wqe->has_rd_atomic = 0;
+ atomic_inc(&qp->req.rd_atomic);
+ if (qp->req.need_rd_atomic) {
+ qp->comp.timeout_retry = 0;
+ qp->req.need_rd_atomic = 0;
+ rxe_run_task(&qp->req.task, 1);
+ }
+ }
+
+ if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+ /* state_lock used by requester & completer */
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if ((qp->req.state == QP_STATE_DRAIN) &&
+ (qp->comp.psn == qp->req.psn)) {
+ qp->req.state = QP_STATE_DRAINED;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_SQ_DRAINED;
+ qp->ibqp.event_handler(&ev,
+ qp->ibqp.qp_context);
+ }
+ } else {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ }
+ }
+
+ do_complete(qp, wqe);
+
+ if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+ return COMPST_UPDATE_COMP;
+ else
+ return COMPST_DONE;
+}
+
+static inline enum comp_state complete_wqe(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ qp->comp.opcode = -1;
+
+ if (pkt) {
+ if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+ qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ rxe_run_task(&qp->req.task, 1);
+ }
+ }
+
+ do_complete(qp, wqe);
+
+ return COMPST_GET_WQE;
+}
+
+int rxe_completer(void *arg)
+{
+ struct rxe_qp *qp = (struct rxe_qp *)arg;
+ struct rxe_send_wqe *wqe = wqe;
+ struct sk_buff *skb = NULL;
+ struct rxe_pkt_info *pkt = NULL;
+ enum comp_state state;
+
+ if (!qp->valid) {
+ while ((skb = skb_dequeue(&qp->resp_pkts))) {
+ rxe_drop_ref(qp);
+ kfree_skb(skb);
+ }
+ skb = NULL;
+ pkt = NULL;
+
+ while (queue_head(qp->sq.queue))
+ advance_consumer(qp->sq.queue);
+
+ goto exit;
+ }
+
+ if (qp->req.state == QP_STATE_ERROR) {
+ while ((skb = skb_dequeue(&qp->resp_pkts))) {
+ rxe_drop_ref(qp);
+ kfree_skb(skb);
+ }
+ skb = NULL;
+ pkt = NULL;
+
+ while ((wqe = queue_head(qp->sq.queue))) {
+ wqe->status = IB_WC_WR_FLUSH_ERR;
+ do_complete(qp, wqe);
+ }
+
+ goto exit;
+ }
+
+ if (qp->req.state == QP_STATE_RESET) {
+ while ((skb = skb_dequeue(&qp->resp_pkts))) {
+ rxe_drop_ref(qp);
+ kfree_skb(skb);
+ }
+ skb = NULL;
+ pkt = NULL;
+
+ while (queue_head(qp->sq.queue))
+ advance_consumer(qp->sq.queue);
+
+ goto exit;
+ }
+
+ if (qp->comp.timeout) {
+ qp->comp.timeout_retry = 1;
+ qp->comp.timeout = 0;
+ } else {
+ qp->comp.timeout_retry = 0;
+ }
+
+ if (qp->req.need_retry)
+ goto exit;
+
+ state = COMPST_GET_ACK;
+
+ while (1) {
+ pr_debug("state = %s\n", comp_state_name[state]);
+ switch (state) {
+ case COMPST_GET_ACK:
+ skb = skb_dequeue(&qp->resp_pkts);
+ if (skb) {
+ pkt = SKB_TO_PKT(skb);
+ qp->comp.timeout_retry = 0;
+ }
+ state = COMPST_GET_WQE;
+ break;
+
+ case COMPST_GET_WQE:
+ state = get_wqe(qp, pkt, &wqe);
+ break;
+
+ case COMPST_CHECK_PSN:
+ state = check_psn(qp, pkt, wqe);
+ break;
+
+ case COMPST_CHECK_ACK:
+ state = check_ack(qp, pkt, wqe);
+ break;
+
+ case COMPST_READ:
+ state = do_read(qp, pkt, wqe);
+ break;
+
+ case COMPST_ATOMIC:
+ state = do_atomic(qp, pkt, wqe);
+ break;
+
+ case COMPST_WRITE_SEND:
+ if (wqe->state == wqe_state_pending &&
+ wqe->last_psn == pkt->psn)
+ state = COMPST_COMP_ACK;
+ else
+ state = COMPST_UPDATE_COMP;
+ break;
+
+ case COMPST_COMP_ACK:
+ state = complete_ack(qp, pkt, wqe);
+ break;
+
+ case COMPST_COMP_WQE:
+ state = complete_wqe(qp, pkt, wqe);
+ break;
+
+ case COMPST_UPDATE_COMP:
+ if (pkt->mask & RXE_END_MASK)
+ qp->comp.opcode = -1;
+ else
+ qp->comp.opcode = pkt->opcode;
+
+ if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+ qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ rxe_run_task(&qp->req.task, 1);
+ }
+
+ state = COMPST_DONE;
+ break;
+
+ case COMPST_DONE:
+ if (pkt) {
+ rxe_drop_ref(pkt->qp);
+ kfree_skb(skb);
+ }
+ goto done;
+
+ case COMPST_EXIT:
+ if (qp->comp.timeout_retry && wqe) {
+ state = COMPST_ERROR_RETRY;
+ break;
+ }
+
+ /* re reset the timeout counter if
+ * (1) QP is type RC
+ * (2) the QP is alive
+ * (3) there is a packet sent by the requester that
+ * might be acked (we still might get spurious
+ * timeouts but try to keep them as few as possible)
+ * (4) the timeout parameter is set
+ */
+ if ((qp_type(qp) == IB_QPT_RC) &&
+ (qp->req.state == QP_STATE_READY) &&
+ (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
+ qp->qp_timeout_jiffies)
+ mod_timer(&qp->retrans_timer,
+ jiffies + qp->qp_timeout_jiffies);
+ goto exit;
+
+ case COMPST_ERROR_RETRY:
+ /* we come here if the retry timer fired and we did
+ * not receive a response packet. try to retry the send
+ * queue if that makes sense and the limits have not
+ * been exceeded. remember that some timeouts are
+ * spurious since we do not reset the timer but kick
+ * it down the road or let it expire
+ */
+
+ /* there is nothing to retry in this case */
+ if (!wqe || (wqe->state == wqe_state_posted))
+ goto exit;
+
+ if (qp->comp.retry_cnt > 0) {
+ if (qp->comp.retry_cnt != 7)
+ qp->comp.retry_cnt--;
+
+ /* no point in retrying if we have already
+ * seen the last ack that the requester could
+ * have caused
+ */
+ if (psn_compare(qp->req.psn,
+ qp->comp.psn) > 0) {
+ /* tell the requester to retry the
+ * send send queue next time around
+ */
+ qp->req.need_retry = 1;
+ rxe_run_task(&qp->req.task, 1);
+ }
+
+ if (pkt) {
+ rxe_drop_ref(pkt->qp);
+ kfree_skb(skb);
+ }
+
+ goto exit;
+
+ } else {
+ wqe->status = IB_WC_RETRY_EXC_ERR;
+ state = COMPST_ERROR;
+ }
+ break;
+
+ case COMPST_RNR_RETRY:
+ if (qp->comp.rnr_retry > 0) {
+ if (qp->comp.rnr_retry != 7)
+ qp->comp.rnr_retry--;
+
+ qp->req.need_retry = 1;
+ pr_debug("set rnr nak timer\n");
+ mod_timer(&qp->rnr_nak_timer,
+ jiffies + rnrnak_jiffies(aeth_syn(pkt)
+ & ~AETH_TYPE_MASK));
+ goto exit;
+ } else {
+ wqe->status = IB_WC_RNR_RETRY_EXC_ERR;
+ state = COMPST_ERROR;
+ }
+ break;
+
+ case COMPST_ERROR:
+ do_complete(qp, wqe);
+ rxe_qp_error(qp);
+
+ if (pkt) {
+ rxe_drop_ref(pkt->qp);
+ kfree_skb(skb);
+ }
+
+ goto exit;
+ }
+ }
+
+exit:
+ /* we come here if we are done with processing and want the task to
+ * exit from the loop calling us
+ */
+ return -EAGAIN;
+
+done:
+ /* we come here if we have processed a packet we want the task to call
+ * us again to see if there is anything else to do
+ */
+ return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
new file mode 100644
index 000000000000..e5e6a5e7dee9
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+ int cqe, int comp_vector, struct ib_udata *udata)
+{
+ int count;
+
+ if (cqe <= 0) {
+ pr_warn("cqe(%d) <= 0\n", cqe);
+ goto err1;
+ }
+
+ if (cqe > rxe->attr.max_cqe) {
+ pr_warn("cqe(%d) > max_cqe(%d)\n",
+ cqe, rxe->attr.max_cqe);
+ goto err1;
+ }
+
+ if (cq) {
+ count = queue_count(cq->queue);
+ if (cqe < count) {
+ pr_warn("cqe(%d) < current # elements in queue (%d)",
+ cqe, count);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static void rxe_send_complete(unsigned long data)
+{
+ struct rxe_cq *cq = (struct rxe_cq *)data;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+ int comp_vector, struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ int err;
+
+ cq->queue = rxe_queue_init(rxe, &cqe,
+ sizeof(struct rxe_cqe));
+ if (!cq->queue) {
+ pr_warn("unable to create cq\n");
+ return -ENOMEM;
+ }
+
+ err = do_mmap_info(rxe, udata, false, context, cq->queue->buf,
+ cq->queue->buf_size, &cq->queue->ip);
+ if (err) {
+ kvfree(cq->queue->buf);
+ kfree(cq->queue);
+ return err;
+ }
+
+ if (udata)
+ cq->is_user = 1;
+
+ tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq);
+
+ spin_lock_init(&cq->cq_lock);
+ cq->ibcq.cqe = cqe;
+ return 0;
+}
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct ib_udata *udata)
+{
+ int err;
+
+ err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe,
+ sizeof(struct rxe_cqe),
+ cq->queue->ip ? cq->queue->ip->context : NULL,
+ udata, NULL, &cq->cq_lock);
+ if (!err)
+ cq->ibcq.cqe = cqe;
+
+ return err;
+}
+
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
+{
+ struct ib_event ev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->cq_lock, flags);
+
+ if (unlikely(queue_full(cq->queue))) {
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+ if (cq->ibcq.event_handler) {
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+
+ return -EBUSY;
+ }
+
+ memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe));
+
+ /* make sure all changes to the CQ are written before we update the
+ * producer pointer
+ */
+ smp_wmb();
+
+ advance_producer(cq->queue);
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+ if ((cq->notify == IB_CQ_NEXT_COMP) ||
+ (cq->notify == IB_CQ_SOLICITED && solicited)) {
+ cq->notify = 0;
+ tasklet_schedule(&cq->comp_task);
+ }
+
+ return 0;
+}
+
+void rxe_cq_cleanup(void *arg)
+{
+ struct rxe_cq *cq = arg;
+
+ if (cq->queue)
+ rxe_queue_cleanup(cq->queue);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_dma.c b/drivers/infiniband/sw/rxe/rxe_dma.c
new file mode 100644
index 000000000000..7634c1a81b2b
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_dma.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+#define DMA_BAD_ADDER ((u64)0)
+
+static int rxe_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == DMA_BAD_ADDER;
+}
+
+static u64 rxe_dma_map_single(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ WARN_ON(!valid_dma_direction(direction));
+ return (uintptr_t)cpu_addr;
+}
+
+static void rxe_dma_unmap_single(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ WARN_ON(!valid_dma_direction(direction));
+}
+
+static u64 rxe_dma_map_page(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size, enum dma_data_direction direction)
+{
+ u64 addr;
+
+ WARN_ON(!valid_dma_direction(direction));
+
+ if (offset + size > PAGE_SIZE) {
+ addr = DMA_BAD_ADDER;
+ goto done;
+ }
+
+ addr = (uintptr_t)page_address(page);
+ if (addr)
+ addr += offset;
+
+done:
+ return addr;
+}
+
+static void rxe_dma_unmap_page(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ WARN_ON(!valid_dma_direction(direction));
+}
+
+static int rxe_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ u64 addr;
+ int i;
+ int ret = nents;
+
+ WARN_ON(!valid_dma_direction(direction));
+
+ for_each_sg(sgl, sg, nents, i) {
+ addr = (uintptr_t)page_address(sg_page(sg));
+ if (!addr) {
+ ret = 0;
+ break;
+ }
+ sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = sg->length;
+#endif
+ }
+
+ return ret;
+}
+
+static void rxe_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ WARN_ON(!valid_dma_direction(direction));
+}
+
+static void rxe_sync_single_for_cpu(struct ib_device *dev,
+ u64 addr,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static void rxe_sync_single_for_device(struct ib_device *dev,
+ u64 addr,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static void *rxe_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p)
+ addr = page_address(p);
+
+ if (dma_handle)
+ *dma_handle = (uintptr_t)addr;
+
+ return addr;
+}
+
+static void rxe_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops rxe_dma_mapping_ops = {
+ .mapping_error = rxe_mapping_error,
+ .map_single = rxe_dma_map_single,
+ .unmap_single = rxe_dma_unmap_single,
+ .map_page = rxe_dma_map_page,
+ .unmap_page = rxe_dma_unmap_page,
+ .map_sg = rxe_map_sg,
+ .unmap_sg = rxe_unmap_sg,
+ .sync_single_for_cpu = rxe_sync_single_for_cpu,
+ .sync_single_for_device = rxe_sync_single_for_device,
+ .alloc_coherent = rxe_dma_alloc_coherent,
+ .free_coherent = rxe_dma_free_coherent
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
new file mode 100644
index 000000000000..d57b5e956ceb
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -0,0 +1,952 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_HDR_H
+#define RXE_HDR_H
+
+/* extracted information about a packet carried in an sk_buff struct fits in
+ * the skbuff cb array. Must be at most 48 bytes. stored in control block of
+ * sk_buff for received packets.
+ */
+struct rxe_pkt_info {
+ struct rxe_dev *rxe; /* device that owns packet */
+ struct rxe_qp *qp; /* qp that owns packet */
+ struct rxe_send_wqe *wqe; /* send wqe */
+ u8 *hdr; /* points to bth */
+ u32 mask; /* useful info about pkt */
+ u32 psn; /* bth psn of packet */
+ u16 pkey_index; /* partition of pkt */
+ u16 paylen; /* length of bth - icrc */
+ u8 port_num; /* port pkt received on */
+ u8 opcode; /* bth opcode of packet */
+ u8 offset; /* bth offset from pkt->hdr */
+};
+
+/* Macros should be used only for received skb */
+#define SKB_TO_PKT(skb) ((struct rxe_pkt_info *)(skb)->cb)
+#define PKT_TO_SKB(pkt) container_of((void *)(pkt), struct sk_buff, cb)
+
+/*
+ * IBA header types and methods
+ *
+ * Some of these are for reference and completeness only since
+ * rxe does not currently support RD transport
+ * most of this could be moved into IB core. ib_pack.h has
+ * part of this but is incomplete
+ *
+ * Header specific routines to insert/extract values to/from headers
+ * the routines that are named __hhh_(set_)fff() take a pointer to a
+ * hhh header and get(set) the fff field. The routines named
+ * hhh_(set_)fff take a packet info struct and find the
+ * header and field based on the opcode in the packet.
+ * Conversion to/from network byte order from cpu order is also done.
+ */
+
+#define RXE_ICRC_SIZE (4)
+#define RXE_MAX_HDR_LENGTH (80)
+
+/******************************************************************************
+ * Base Transport Header
+ ******************************************************************************/
+struct rxe_bth {
+ u8 opcode;
+ u8 flags;
+ __be16 pkey;
+ __be32 qpn;
+ __be32 apsn;
+};
+
+#define BTH_TVER (0)
+#define BTH_DEF_PKEY (0xffff)
+
+#define BTH_SE_MASK (0x80)
+#define BTH_MIG_MASK (0x40)
+#define BTH_PAD_MASK (0x30)
+#define BTH_TVER_MASK (0x0f)
+#define BTH_FECN_MASK (0x80000000)
+#define BTH_BECN_MASK (0x40000000)
+#define BTH_RESV6A_MASK (0x3f000000)
+#define BTH_QPN_MASK (0x00ffffff)
+#define BTH_ACK_MASK (0x80000000)
+#define BTH_RESV7_MASK (0x7f000000)
+#define BTH_PSN_MASK (0x00ffffff)
+
+static inline u8 __bth_opcode(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return bth->opcode;
+}
+
+static inline void __bth_set_opcode(void *arg, u8 opcode)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->opcode = opcode;
+}
+
+static inline u8 __bth_se(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (BTH_SE_MASK & bth->flags);
+}
+
+static inline void __bth_set_se(void *arg, int se)
+{
+ struct rxe_bth *bth = arg;
+
+ if (se)
+ bth->flags |= BTH_SE_MASK;
+ else
+ bth->flags &= ~BTH_SE_MASK;
+}
+
+static inline u8 __bth_mig(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (BTH_MIG_MASK & bth->flags);
+}
+
+static inline void __bth_set_mig(void *arg, u8 mig)
+{
+ struct rxe_bth *bth = arg;
+
+ if (mig)
+ bth->flags |= BTH_MIG_MASK;
+ else
+ bth->flags &= ~BTH_MIG_MASK;
+}
+
+static inline u8 __bth_pad(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return (BTH_PAD_MASK & bth->flags) >> 4;
+}
+
+static inline void __bth_set_pad(void *arg, u8 pad)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->flags = (BTH_PAD_MASK & (pad << 4)) |
+ (~BTH_PAD_MASK & bth->flags);
+}
+
+static inline u8 __bth_tver(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return BTH_TVER_MASK & bth->flags;
+}
+
+static inline void __bth_set_tver(void *arg, u8 tver)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->flags = (BTH_TVER_MASK & tver) |
+ (~BTH_TVER_MASK & bth->flags);
+}
+
+static inline u16 __bth_pkey(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return be16_to_cpu(bth->pkey);
+}
+
+static inline void __bth_set_pkey(void *arg, u16 pkey)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->pkey = cpu_to_be16(pkey);
+}
+
+static inline u32 __bth_qpn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return BTH_QPN_MASK & be32_to_cpu(bth->qpn);
+}
+
+static inline void __bth_set_qpn(void *arg, u32 qpn)
+{
+ struct rxe_bth *bth = arg;
+ u32 resvqpn = be32_to_cpu(bth->qpn);
+
+ bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) |
+ (~BTH_QPN_MASK & resvqpn));
+}
+
+static inline int __bth_fecn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn);
+}
+
+static inline void __bth_set_fecn(void *arg, int fecn)
+{
+ struct rxe_bth *bth = arg;
+
+ if (fecn)
+ bth->qpn |= cpu_to_be32(BTH_FECN_MASK);
+ else
+ bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK);
+}
+
+static inline int __bth_becn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn);
+}
+
+static inline void __bth_set_becn(void *arg, int becn)
+{
+ struct rxe_bth *bth = arg;
+
+ if (becn)
+ bth->qpn |= cpu_to_be32(BTH_BECN_MASK);
+ else
+ bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK);
+}
+
+static inline u8 __bth_resv6a(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24;
+}
+
+static inline void __bth_set_resv6a(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK);
+}
+
+static inline int __bth_ack(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn);
+}
+
+static inline void __bth_set_ack(void *arg, int ack)
+{
+ struct rxe_bth *bth = arg;
+
+ if (ack)
+ bth->apsn |= cpu_to_be32(BTH_ACK_MASK);
+ else
+ bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK);
+}
+
+static inline void __bth_set_resv7(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK);
+}
+
+static inline u32 __bth_psn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return BTH_PSN_MASK & be32_to_cpu(bth->apsn);
+}
+
+static inline void __bth_set_psn(void *arg, u32 psn)
+{
+ struct rxe_bth *bth = arg;
+ u32 apsn = be32_to_cpu(bth->apsn);
+
+ bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) |
+ (~BTH_PSN_MASK & apsn));
+}
+
+static inline u8 bth_opcode(struct rxe_pkt_info *pkt)
+{
+ return __bth_opcode(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode)
+{
+ __bth_set_opcode(pkt->hdr + pkt->offset, opcode);
+}
+
+static inline u8 bth_se(struct rxe_pkt_info *pkt)
+{
+ return __bth_se(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_se(struct rxe_pkt_info *pkt, int se)
+{
+ __bth_set_se(pkt->hdr + pkt->offset, se);
+}
+
+static inline u8 bth_mig(struct rxe_pkt_info *pkt)
+{
+ return __bth_mig(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig)
+{
+ __bth_set_mig(pkt->hdr + pkt->offset, mig);
+}
+
+static inline u8 bth_pad(struct rxe_pkt_info *pkt)
+{
+ return __bth_pad(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad)
+{
+ __bth_set_pad(pkt->hdr + pkt->offset, pad);
+}
+
+static inline u8 bth_tver(struct rxe_pkt_info *pkt)
+{
+ return __bth_tver(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver)
+{
+ __bth_set_tver(pkt->hdr + pkt->offset, tver);
+}
+
+static inline u16 bth_pkey(struct rxe_pkt_info *pkt)
+{
+ return __bth_pkey(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey)
+{
+ __bth_set_pkey(pkt->hdr + pkt->offset, pkey);
+}
+
+static inline u32 bth_qpn(struct rxe_pkt_info *pkt)
+{
+ return __bth_qpn(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn)
+{
+ __bth_set_qpn(pkt->hdr + pkt->offset, qpn);
+}
+
+static inline int bth_fecn(struct rxe_pkt_info *pkt)
+{
+ return __bth_fecn(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn)
+{
+ __bth_set_fecn(pkt->hdr + pkt->offset, fecn);
+}
+
+static inline int bth_becn(struct rxe_pkt_info *pkt)
+{
+ return __bth_becn(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn)
+{
+ __bth_set_becn(pkt->hdr + pkt->offset, becn);
+}
+
+static inline u8 bth_resv6a(struct rxe_pkt_info *pkt)
+{
+ return __bth_resv6a(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_resv6a(struct rxe_pkt_info *pkt)
+{
+ __bth_set_resv6a(pkt->hdr + pkt->offset);
+}
+
+static inline int bth_ack(struct rxe_pkt_info *pkt)
+{
+ return __bth_ack(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack)
+{
+ __bth_set_ack(pkt->hdr + pkt->offset, ack);
+}
+
+static inline void bth_set_resv7(struct rxe_pkt_info *pkt)
+{
+ __bth_set_resv7(pkt->hdr + pkt->offset);
+}
+
+static inline u32 bth_psn(struct rxe_pkt_info *pkt)
+{
+ return __bth_psn(pkt->hdr + pkt->offset);
+}
+
+static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn)
+{
+ __bth_set_psn(pkt->hdr + pkt->offset, psn);
+}
+
+static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se,
+ int mig, int pad, u16 pkey, u32 qpn, int ack_req,
+ u32 psn)
+{
+ struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr + pkt->offset);
+
+ bth->opcode = opcode;
+ bth->flags = (pad << 4) & BTH_PAD_MASK;
+ if (se)
+ bth->flags |= BTH_SE_MASK;
+ if (mig)
+ bth->flags |= BTH_MIG_MASK;
+ bth->pkey = cpu_to_be16(pkey);
+ bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK);
+ psn &= BTH_PSN_MASK;
+ if (ack_req)
+ psn |= BTH_ACK_MASK;
+ bth->apsn = cpu_to_be32(psn);
+}
+
+/******************************************************************************
+ * Reliable Datagram Extended Transport Header
+ ******************************************************************************/
+struct rxe_rdeth {
+ __be32 een;
+};
+
+#define RDETH_EEN_MASK (0x00ffffff)
+
+static inline u8 __rdeth_een(void *arg)
+{
+ struct rxe_rdeth *rdeth = arg;
+
+ return RDETH_EEN_MASK & be32_to_cpu(rdeth->een);
+}
+
+static inline void __rdeth_set_een(void *arg, u32 een)
+{
+ struct rxe_rdeth *rdeth = arg;
+
+ rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een);
+}
+
+static inline u8 rdeth_een(struct rxe_pkt_info *pkt)
+{
+ return __rdeth_een(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RDETH]);
+}
+
+static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een)
+{
+ __rdeth_set_een(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een);
+}
+
+/******************************************************************************
+ * Datagram Extended Transport Header
+ ******************************************************************************/
+struct rxe_deth {
+ __be32 qkey;
+ __be32 sqp;
+};
+
+#define GSI_QKEY (0x80010000)
+#define DETH_SQP_MASK (0x00ffffff)
+
+static inline u32 __deth_qkey(void *arg)
+{
+ struct rxe_deth *deth = arg;
+
+ return be32_to_cpu(deth->qkey);
+}
+
+static inline void __deth_set_qkey(void *arg, u32 qkey)
+{
+ struct rxe_deth *deth = arg;
+
+ deth->qkey = cpu_to_be32(qkey);
+}
+
+static inline u32 __deth_sqp(void *arg)
+{
+ struct rxe_deth *deth = arg;
+
+ return DETH_SQP_MASK & be32_to_cpu(deth->sqp);
+}
+
+static inline void __deth_set_sqp(void *arg, u32 sqp)
+{
+ struct rxe_deth *deth = arg;
+
+ deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp);
+}
+
+static inline u32 deth_qkey(struct rxe_pkt_info *pkt)
+{
+ return __deth_qkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_DETH]);
+}
+
+static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey)
+{
+ __deth_set_qkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey);
+}
+
+static inline u32 deth_sqp(struct rxe_pkt_info *pkt)
+{
+ return __deth_sqp(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_DETH]);
+}
+
+static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp)
+{
+ __deth_set_sqp(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp);
+}
+
+/******************************************************************************
+ * RDMA Extended Transport Header
+ ******************************************************************************/
+struct rxe_reth {
+ __be64 va;
+ __be32 rkey;
+ __be32 len;
+};
+
+static inline u64 __reth_va(void *arg)
+{
+ struct rxe_reth *reth = arg;
+
+ return be64_to_cpu(reth->va);
+}
+
+static inline void __reth_set_va(void *arg, u64 va)
+{
+ struct rxe_reth *reth = arg;
+
+ reth->va = cpu_to_be64(va);
+}
+
+static inline u32 __reth_rkey(void *arg)
+{
+ struct rxe_reth *reth = arg;
+
+ return be32_to_cpu(reth->rkey);
+}
+
+static inline void __reth_set_rkey(void *arg, u32 rkey)
+{
+ struct rxe_reth *reth = arg;
+
+ reth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u32 __reth_len(void *arg)
+{
+ struct rxe_reth *reth = arg;
+
+ return be32_to_cpu(reth->len);
+}
+
+static inline void __reth_set_len(void *arg, u32 len)
+{
+ struct rxe_reth *reth = arg;
+
+ reth->len = cpu_to_be32(len);
+}
+
+static inline u64 reth_va(struct rxe_pkt_info *pkt)
+{
+ return __reth_va(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RETH]);
+}
+
+static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+ __reth_set_va(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RETH], va);
+}
+
+static inline u32 reth_rkey(struct rxe_pkt_info *pkt)
+{
+ return __reth_rkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RETH]);
+}
+
+static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+ __reth_set_rkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey);
+}
+
+static inline u32 reth_len(struct rxe_pkt_info *pkt)
+{
+ return __reth_len(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RETH]);
+}
+
+static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len)
+{
+ __reth_set_len(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_RETH], len);
+}
+
+/******************************************************************************
+ * Atomic Extended Transport Header
+ ******************************************************************************/
+struct rxe_atmeth {
+ __be64 va;
+ __be32 rkey;
+ __be64 swap_add;
+ __be64 comp;
+} __attribute__((__packed__));
+
+static inline u64 __atmeth_va(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be64_to_cpu(atmeth->va);
+}
+
+static inline void __atmeth_set_va(void *arg, u64 va)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->va = cpu_to_be64(va);
+}
+
+static inline u32 __atmeth_rkey(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be32_to_cpu(atmeth->rkey);
+}
+
+static inline void __atmeth_set_rkey(void *arg, u32 rkey)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u64 __atmeth_swap_add(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be64_to_cpu(atmeth->swap_add);
+}
+
+static inline void __atmeth_set_swap_add(void *arg, u64 swap_add)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->swap_add = cpu_to_be64(swap_add);
+}
+
+static inline u64 __atmeth_comp(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be64_to_cpu(atmeth->comp);
+}
+
+static inline void __atmeth_set_comp(void *arg, u64 comp)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->comp = cpu_to_be64(comp);
+}
+
+static inline u64 atmeth_va(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_va(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+ __atmeth_set_va(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va);
+}
+
+static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_rkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+ __atmeth_set_rkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey);
+}
+
+static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_swap_add(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add)
+{
+ __atmeth_set_swap_add(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add);
+}
+
+static inline u64 atmeth_comp(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_comp(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp)
+{
+ __atmeth_set_comp(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp);
+}
+
+/******************************************************************************
+ * Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_aeth {
+ __be32 smsn;
+};
+
+#define AETH_SYN_MASK (0xff000000)
+#define AETH_MSN_MASK (0x00ffffff)
+
+enum aeth_syndrome {
+ AETH_TYPE_MASK = 0xe0,
+ AETH_ACK = 0x00,
+ AETH_RNR_NAK = 0x20,
+ AETH_RSVD = 0x40,
+ AETH_NAK = 0x60,
+ AETH_ACK_UNLIMITED = 0x1f,
+ AETH_NAK_PSN_SEQ_ERROR = 0x60,
+ AETH_NAK_INVALID_REQ = 0x61,
+ AETH_NAK_REM_ACC_ERR = 0x62,
+ AETH_NAK_REM_OP_ERR = 0x63,
+ AETH_NAK_INV_RD_REQ = 0x64,
+};
+
+static inline u8 __aeth_syn(void *arg)
+{
+ struct rxe_aeth *aeth = arg;
+
+ return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24;
+}
+
+static inline void __aeth_set_syn(void *arg, u8 syn)
+{
+ struct rxe_aeth *aeth = arg;
+ u32 smsn = be32_to_cpu(aeth->smsn);
+
+ aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) |
+ (~AETH_SYN_MASK & smsn));
+}
+
+static inline u32 __aeth_msn(void *arg)
+{
+ struct rxe_aeth *aeth = arg;
+
+ return AETH_MSN_MASK & be32_to_cpu(aeth->smsn);
+}
+
+static inline void __aeth_set_msn(void *arg, u32 msn)
+{
+ struct rxe_aeth *aeth = arg;
+ u32 smsn = be32_to_cpu(aeth->smsn);
+
+ aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) |
+ (~AETH_MSN_MASK & smsn));
+}
+
+static inline u8 aeth_syn(struct rxe_pkt_info *pkt)
+{
+ return __aeth_syn(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn)
+{
+ __aeth_set_syn(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn);
+}
+
+static inline u32 aeth_msn(struct rxe_pkt_info *pkt)
+{
+ return __aeth_msn(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn)
+{
+ __aeth_set_msn(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn);
+}
+
+/******************************************************************************
+ * Atomic Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_atmack {
+ __be64 orig;
+};
+
+static inline u64 __atmack_orig(void *arg)
+{
+ struct rxe_atmack *atmack = arg;
+
+ return be64_to_cpu(atmack->orig);
+}
+
+static inline void __atmack_set_orig(void *arg, u64 orig)
+{
+ struct rxe_atmack *atmack = arg;
+
+ atmack->orig = cpu_to_be64(orig);
+}
+
+static inline u64 atmack_orig(struct rxe_pkt_info *pkt)
+{
+ return __atmack_orig(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]);
+}
+
+static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig)
+{
+ __atmack_set_orig(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig);
+}
+
+/******************************************************************************
+ * Immediate Extended Transport Header
+ ******************************************************************************/
+struct rxe_immdt {
+ __be32 imm;
+};
+
+static inline __be32 __immdt_imm(void *arg)
+{
+ struct rxe_immdt *immdt = arg;
+
+ return immdt->imm;
+}
+
+static inline void __immdt_set_imm(void *arg, __be32 imm)
+{
+ struct rxe_immdt *immdt = arg;
+
+ immdt->imm = imm;
+}
+
+static inline __be32 immdt_imm(struct rxe_pkt_info *pkt)
+{
+ return __immdt_imm(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]);
+}
+
+static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm)
+{
+ __immdt_set_imm(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm);
+}
+
+/******************************************************************************
+ * Invalidate Extended Transport Header
+ ******************************************************************************/
+struct rxe_ieth {
+ __be32 rkey;
+};
+
+static inline u32 __ieth_rkey(void *arg)
+{
+ struct rxe_ieth *ieth = arg;
+
+ return be32_to_cpu(ieth->rkey);
+}
+
+static inline void __ieth_set_rkey(void *arg, u32 rkey)
+{
+ struct rxe_ieth *ieth = arg;
+
+ ieth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u32 ieth_rkey(struct rxe_pkt_info *pkt)
+{
+ return __ieth_rkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_IETH]);
+}
+
+static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+ __ieth_set_rkey(pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey);
+}
+
+enum rxe_hdr_length {
+ RXE_BTH_BYTES = sizeof(struct rxe_bth),
+ RXE_DETH_BYTES = sizeof(struct rxe_deth),
+ RXE_IMMDT_BYTES = sizeof(struct rxe_immdt),
+ RXE_RETH_BYTES = sizeof(struct rxe_reth),
+ RXE_AETH_BYTES = sizeof(struct rxe_aeth),
+ RXE_ATMACK_BYTES = sizeof(struct rxe_atmack),
+ RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth),
+ RXE_IETH_BYTES = sizeof(struct rxe_ieth),
+ RXE_RDETH_BYTES = sizeof(struct rxe_rdeth),
+};
+
+static inline size_t header_size(struct rxe_pkt_info *pkt)
+{
+ return pkt->offset + rxe_opcode[pkt->opcode].length;
+}
+
+static inline void *payload_addr(struct rxe_pkt_info *pkt)
+{
+ return pkt->hdr + pkt->offset
+ + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD];
+}
+
+static inline size_t payload_size(struct rxe_pkt_info *pkt)
+{
+ return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]
+ - bth_pad(pkt) - RXE_ICRC_SIZE;
+}
+
+#endif /* RXE_HDR_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c
new file mode 100644
index 000000000000..413b56b23a06
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_icrc.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Compute a partial ICRC for all the IB transport headers. */
+u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{
+ unsigned int bth_offset = 0;
+ struct iphdr *ip4h = NULL;
+ struct ipv6hdr *ip6h = NULL;
+ struct udphdr *udph;
+ struct rxe_bth *bth;
+ int crc;
+ int length;
+ int hdr_size = sizeof(struct udphdr) +
+ (skb->protocol == htons(ETH_P_IP) ?
+ sizeof(struct iphdr) : sizeof(struct ipv6hdr));
+ /* pseudo header buffer size is calculate using ipv6 header size since
+ * it is bigger than ipv4
+ */
+ u8 pshdr[sizeof(struct udphdr) +
+ sizeof(struct ipv6hdr) +
+ RXE_BTH_BYTES];
+
+ /* This seed is the result of computing a CRC with a seed of
+ * 0xfffffff and 8 bytes of 0xff representing a masked LRH.
+ */
+ crc = 0xdebb20e3;
+
+ if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */
+ memcpy(pshdr, ip_hdr(skb), hdr_size);
+ ip4h = (struct iphdr *)pshdr;
+ udph = (struct udphdr *)(ip4h + 1);
+
+ ip4h->ttl = 0xff;
+ ip4h->check = CSUM_MANGLED_0;
+ ip4h->tos = 0xff;
+ } else { /* IPv6 */
+ memcpy(pshdr, ipv6_hdr(skb), hdr_size);
+ ip6h = (struct ipv6hdr *)pshdr;
+ udph = (struct udphdr *)(ip6h + 1);
+
+ memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl));
+ ip6h->priority = 0xf;
+ ip6h->hop_limit = 0xff;
+ }
+ udph->check = CSUM_MANGLED_0;
+
+ bth_offset += hdr_size;
+
+ memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES);
+ bth = (struct rxe_bth *)&pshdr[bth_offset];
+
+ /* exclude bth.resv8a */
+ bth->qpn |= cpu_to_be32(~BTH_QPN_MASK);
+
+ length = hdr_size + RXE_BTH_BYTES;
+ crc = crc32_le(crc, pshdr, length);
+
+ /* And finish to compute the CRC on the remainder of the headers. */
+ crc = crc32_le(crc, pkt->hdr + RXE_BTH_BYTES,
+ rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES);
+ return crc;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
new file mode 100644
index 000000000000..4a5484ef604f
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_LOC_H
+#define RXE_LOC_H
+
+/* rxe_av.c */
+
+int rxe_av_chk_attr(struct rxe_dev *rxe, struct ib_ah_attr *attr);
+
+int rxe_av_from_attr(struct rxe_dev *rxe, u8 port_num,
+ struct rxe_av *av, struct ib_ah_attr *attr);
+
+int rxe_av_to_attr(struct rxe_dev *rxe, struct rxe_av *av,
+ struct ib_ah_attr *attr);
+
+int rxe_av_fill_ip_info(struct rxe_dev *rxe,
+ struct rxe_av *av,
+ struct ib_ah_attr *attr,
+ struct ib_gid_attr *sgid_attr,
+ union ib_gid *sgid);
+
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt);
+
+/* rxe_cq.c */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+ int cqe, int comp_vector, struct ib_udata *udata);
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+ int comp_vector, struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, struct ib_udata *udata);
+
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
+
+void rxe_cq_cleanup(void *arg);
+
+/* rxe_mcast.c */
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+ struct rxe_mc_grp **grp_p);
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct rxe_mc_grp *grp);
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+ union ib_gid *mgid);
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp);
+
+void rxe_mc_cleanup(void *arg);
+
+/* rxe_mmap.c */
+struct rxe_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ struct kref ref;
+ void *obj;
+
+ struct mminfo info;
+};
+
+void rxe_mmap_release(struct kref *ref);
+
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj);
+
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+/* rxe_mr.c */
+enum copy_direction {
+ to_mem_obj,
+ from_mem_obj,
+};
+
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+ int access, struct rxe_mem *mem);
+
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+ u64 length, u64 iova, int access, struct ib_udata *udata,
+ struct rxe_mem *mr);
+
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+ int max_pages, struct rxe_mem *mem);
+
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr,
+ int length, enum copy_direction dir, u32 *crcp);
+
+int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access,
+ struct rxe_dma_info *dma, void *addr, int length,
+ enum copy_direction dir, u32 *crcp);
+
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length);
+
+enum lookup_type {
+ lookup_local,
+ lookup_remote,
+};
+
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+ enum lookup_type type);
+
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length);
+
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+ u64 *page, int num_pages, u64 iova);
+
+void rxe_mem_cleanup(void *arg);
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
+
+/* rxe_qp.c */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init);
+
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+ struct ib_qp_init_attr *init, struct ib_udata *udata,
+ struct ib_pd *ibpd);
+
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
+
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_attr *attr, int mask);
+
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr,
+ int mask, struct ib_udata *udata);
+
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask);
+
+void rxe_qp_error(struct rxe_qp *qp);
+
+void rxe_qp_destroy(struct rxe_qp *qp);
+
+void rxe_qp_cleanup(void *arg);
+
+static inline int qp_num(struct rxe_qp *qp)
+{
+ return qp->ibqp.qp_num;
+}
+
+static inline enum ib_qp_type qp_type(struct rxe_qp *qp)
+{
+ return qp->ibqp.qp_type;
+}
+
+static inline enum ib_qp_state qp_state(struct rxe_qp *qp)
+{
+ return qp->attr.qp_state;
+}
+
+static inline int qp_mtu(struct rxe_qp *qp)
+{
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC)
+ return qp->attr.path_mtu;
+ else
+ return RXE_PORT_MAX_MTU;
+}
+
+static inline int rcv_wqe_size(int max_sge)
+{
+ return sizeof(struct rxe_recv_wqe) +
+ max_sge * sizeof(struct ib_sge);
+}
+
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res);
+
+static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
+{
+ qp->resp.res_head++;
+ if (unlikely(qp->resp.res_head == qp->attr.max_rd_atomic))
+ qp->resp.res_head = 0;
+}
+
+void retransmit_timer(unsigned long data);
+void rnr_nak_timer(unsigned long data);
+
+void dump_qp(struct rxe_qp *qp);
+
+/* rxe_srq.c */
+#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT)
+
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
+
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_init_attr *init,
+ struct ib_ucontext *context, struct ib_udata *udata);
+
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+ struct ib_udata *udata);
+
+extern struct ib_dma_mapping_ops rxe_dma_mapping_ops;
+
+void rxe_release(struct kref *kref);
+
+int rxe_completer(void *arg);
+int rxe_requester(void *arg);
+int rxe_responder(void *arg);
+
+u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb);
+
+void rxe_resp_queue_pkt(struct rxe_dev *rxe,
+ struct rxe_qp *qp, struct sk_buff *skb);
+
+void rxe_comp_queue_pkt(struct rxe_dev *rxe,
+ struct rxe_qp *qp, struct sk_buff *skb);
+
+static inline unsigned wr_opcode_mask(int opcode, struct rxe_qp *qp)
+{
+ return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
+}
+
+static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{
+ int err;
+ int is_request = pkt->mask & RXE_REQ_MASK;
+
+ if ((is_request && (qp->req.state != QP_STATE_READY)) ||
+ (!is_request && (qp->resp.state != QP_STATE_READY))) {
+ pr_info("Packet dropped. QP is not in ready state\n");
+ goto drop;
+ }
+
+ if (pkt->mask & RXE_LOOPBACK_MASK) {
+ memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
+ err = rxe->ifc_ops->loopback(skb);
+ } else {
+ err = rxe->ifc_ops->send(rxe, pkt, skb);
+ }
+
+ if (err) {
+ rxe->xmit_errors++;
+ return err;
+ }
+
+ atomic_inc(&qp->skb_out);
+
+ if ((qp_type(qp) != IB_QPT_RC) &&
+ (pkt->mask & RXE_END_MASK)) {
+ pkt->wqe->state = wqe_state_done;
+ rxe_run_task(&qp->comp.task, 1);
+ }
+
+ goto done;
+
+drop:
+ kfree_skb(skb);
+ err = 0;
+done:
+ return err;
+}
+
+#endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
new file mode 100644
index 000000000000..fa95544ca7e0
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid,
+ struct rxe_mc_grp **grp_p)
+{
+ int err;
+ struct rxe_mc_grp *grp;
+
+ if (rxe->attr.max_mcast_qp_attach == 0) {
+ err = -EINVAL;
+ goto err1;
+ }
+
+ grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+ if (grp)
+ goto done;
+
+ grp = rxe_alloc(&rxe->mc_grp_pool);
+ if (!grp) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ INIT_LIST_HEAD(&grp->qp_list);
+ spin_lock_init(&grp->mcg_lock);
+ grp->rxe = rxe;
+
+ rxe_add_key(grp, mgid);
+
+ err = rxe->ifc_ops->mcast_add(rxe, mgid);
+ if (err)
+ goto err2;
+
+done:
+ *grp_p = grp;
+ return 0;
+
+err2:
+ rxe_drop_ref(grp);
+err1:
+ return err;
+}
+
+int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct rxe_mc_grp *grp)
+{
+ int err;
+ struct rxe_mc_elem *elem;
+
+ /* check to see of the qp is already a member of the group */
+ spin_lock_bh(&qp->grp_lock);
+ spin_lock_bh(&grp->mcg_lock);
+ list_for_each_entry(elem, &grp->qp_list, qp_list) {
+ if (elem->qp == qp) {
+ err = 0;
+ goto out;
+ }
+ }
+
+ if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ elem = rxe_alloc(&rxe->mc_elem_pool);
+ if (!elem) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* each qp holds a ref on the grp */
+ rxe_add_ref(grp);
+
+ grp->num_qp++;
+ elem->qp = qp;
+ elem->grp = grp;
+
+ list_add(&elem->qp_list, &grp->qp_list);
+ list_add(&elem->grp_list, &qp->grp_list);
+
+ err = 0;
+out:
+ spin_unlock_bh(&grp->mcg_lock);
+ spin_unlock_bh(&qp->grp_lock);
+ return err;
+}
+
+int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp,
+ union ib_gid *mgid)
+{
+ struct rxe_mc_grp *grp;
+ struct rxe_mc_elem *elem, *tmp;
+
+ grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid);
+ if (!grp)
+ goto err1;
+
+ spin_lock_bh(&qp->grp_lock);
+ spin_lock_bh(&grp->mcg_lock);
+
+ list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) {
+ if (elem->qp == qp) {
+ list_del(&elem->qp_list);
+ list_del(&elem->grp_list);
+ grp->num_qp--;
+
+ spin_unlock_bh(&grp->mcg_lock);
+ spin_unlock_bh(&qp->grp_lock);
+ rxe_drop_ref(elem);
+ rxe_drop_ref(grp); /* ref held by QP */
+ rxe_drop_ref(grp); /* ref from get_key */
+ return 0;
+ }
+ }
+
+ spin_unlock_bh(&grp->mcg_lock);
+ spin_unlock_bh(&qp->grp_lock);
+ rxe_drop_ref(grp); /* ref from get_key */
+err1:
+ return -EINVAL;
+}
+
+void rxe_drop_all_mcast_groups(struct rxe_qp *qp)
+{
+ struct rxe_mc_grp *grp;
+ struct rxe_mc_elem *elem;
+
+ while (1) {
+ spin_lock_bh(&qp->grp_lock);
+ if (list_empty(&qp->grp_list)) {
+ spin_unlock_bh(&qp->grp_lock);
+ break;
+ }
+ elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem,
+ grp_list);
+ list_del(&elem->grp_list);
+ spin_unlock_bh(&qp->grp_lock);
+
+ grp = elem->grp;
+ spin_lock_bh(&grp->mcg_lock);
+ list_del(&elem->qp_list);
+ grp->num_qp--;
+ spin_unlock_bh(&grp->mcg_lock);
+ rxe_drop_ref(grp);
+ rxe_drop_ref(elem);
+ }
+}
+
+void rxe_mc_cleanup(void *arg)
+{
+ struct rxe_mc_grp *grp = arg;
+ struct rxe_dev *rxe = grp->rxe;
+
+ rxe_drop_key(grp);
+ rxe->ifc_ops->mcast_delete(rxe, &grp->mgid);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c
new file mode 100644
index 000000000000..54b3c7c99eff
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mmap.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+void rxe_mmap_release(struct kref *ref)
+{
+ struct rxe_mmap_info *ip = container_of(ref,
+ struct rxe_mmap_info, ref);
+ struct rxe_dev *rxe = to_rdev(ip->context->device);
+
+ spin_lock_bh(&rxe->pending_lock);
+
+ if (!list_empty(&ip->pending_mmaps))
+ list_del(&ip->pending_mmaps);
+
+ spin_unlock_bh(&rxe->pending_lock);
+
+ vfree(ip->obj); /* buf */
+ kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the memory region is mapped,
+ * to avoid releasing it.
+ */
+static void rxe_vma_open(struct vm_area_struct *vma)
+{
+ struct rxe_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void rxe_vma_close(struct vm_area_struct *vma)
+{
+ struct rxe_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, rxe_mmap_release);
+}
+
+static struct vm_operations_struct rxe_vm_ops = {
+ .open = rxe_vma_open,
+ .close = rxe_vma_close,
+};
+
+/**
+ * rxe_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct rxe_dev *rxe = to_rdev(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct rxe_mmap_info *ip, *pp;
+ int ret;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
+ spin_lock_bh(&rxe->pending_lock);
+ list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) {
+ if (context != ip->context || (__u64)offset != ip->info.offset)
+ continue;
+
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->info.size) {
+ pr_err("mmap region is larger than the object!\n");
+ spin_unlock_bh(&rxe->pending_lock);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ goto found_it;
+ }
+ pr_warn("unable to find pending mmap info\n");
+ spin_unlock_bh(&rxe->pending_lock);
+ ret = -EINVAL;
+ goto done;
+
+found_it:
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_bh(&rxe->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret) {
+ pr_err("rxe: err %d from remap_vmalloc_range\n", ret);
+ goto done;
+ }
+
+ vma->vm_ops = &rxe_vm_ops;
+ vma->vm_private_data = ip;
+ rxe_vma_open(vma);
+done:
+ return ret;
+}
+
+/*
+ * Allocate information for rxe_mmap
+ */
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj)
+{
+ struct rxe_mmap_info *ip;
+
+ ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+ if (!ip)
+ return NULL;
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_bh(&rxe->mmap_offset_lock);
+
+ if (rxe->mmap_offset == 0)
+ rxe->mmap_offset = PAGE_SIZE;
+
+ ip->info.offset = rxe->mmap_offset;
+ rxe->mmap_offset += size;
+
+ spin_unlock_bh(&rxe->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->info.size = size;
+ ip->context = context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+ return ip;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
new file mode 100644
index 000000000000..f3dab6574504
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -0,0 +1,643 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/*
+ * lfsr (linear feedback shift register) with period 255
+ */
+static u8 rxe_get_key(void)
+{
+ static unsigned key = 1;
+
+ key = key << 1;
+
+ key |= (0 != (key & 0x100)) ^ (0 != (key & 0x10))
+ ^ (0 != (key & 0x80)) ^ (0 != (key & 0x40));
+
+ key &= 0xff;
+
+ return key;
+}
+
+int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length)
+{
+ switch (mem->type) {
+ case RXE_MEM_TYPE_DMA:
+ return 0;
+
+ case RXE_MEM_TYPE_MR:
+ case RXE_MEM_TYPE_FMR:
+ return ((iova < mem->iova) ||
+ ((iova + length) > (mem->iova + mem->length))) ?
+ -EFAULT : 0;
+
+ default:
+ return -EFAULT;
+ }
+}
+
+#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \
+ | IB_ACCESS_REMOTE_WRITE \
+ | IB_ACCESS_REMOTE_ATOMIC)
+
+static void rxe_mem_init(int access, struct rxe_mem *mem)
+{
+ u32 lkey = mem->pelem.index << 8 | rxe_get_key();
+ u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;
+
+ if (mem->pelem.pool->type == RXE_TYPE_MR) {
+ mem->ibmr.lkey = lkey;
+ mem->ibmr.rkey = rkey;
+ }
+
+ mem->lkey = lkey;
+ mem->rkey = rkey;
+ mem->state = RXE_MEM_STATE_INVALID;
+ mem->type = RXE_MEM_TYPE_NONE;
+ mem->map_shift = ilog2(RXE_BUF_PER_MAP);
+}
+
+void rxe_mem_cleanup(void *arg)
+{
+ struct rxe_mem *mem = arg;
+ int i;
+
+ if (mem->umem)
+ ib_umem_release(mem->umem);
+
+ if (mem->map) {
+ for (i = 0; i < mem->num_map; i++)
+ kfree(mem->map[i]);
+
+ kfree(mem->map);
+ }
+}
+
+static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf)
+{
+ int i;
+ int num_map;
+ struct rxe_map **map = mem->map;
+
+ num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;
+
+ mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
+ if (!mem->map)
+ goto err1;
+
+ for (i = 0; i < num_map; i++) {
+ mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
+ if (!mem->map[i])
+ goto err2;
+ }
+
+ WARN_ON(!is_power_of_2(RXE_BUF_PER_MAP));
+
+ mem->map_shift = ilog2(RXE_BUF_PER_MAP);
+ mem->map_mask = RXE_BUF_PER_MAP - 1;
+
+ mem->num_buf = num_buf;
+ mem->num_map = num_map;
+ mem->max_buf = num_map * RXE_BUF_PER_MAP;
+
+ return 0;
+
+err2:
+ for (i--; i >= 0; i--)
+ kfree(mem->map[i]);
+
+ kfree(mem->map);
+err1:
+ return -ENOMEM;
+}
+
+int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd,
+ int access, struct rxe_mem *mem)
+{
+ rxe_mem_init(access, mem);
+
+ mem->pd = pd;
+ mem->access = access;
+ mem->state = RXE_MEM_STATE_VALID;
+ mem->type = RXE_MEM_TYPE_DMA;
+
+ return 0;
+}
+
+int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start,
+ u64 length, u64 iova, int access, struct ib_udata *udata,
+ struct rxe_mem *mem)
+{
+ int entry;
+ struct rxe_map **map;
+ struct rxe_phys_buf *buf = NULL;
+ struct ib_umem *umem;
+ struct scatterlist *sg;
+ int num_buf;
+ void *vaddr;
+ int err;
+
+ umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0);
+ if (IS_ERR(umem)) {
+ pr_warn("err %d from rxe_umem_get\n",
+ (int)PTR_ERR(umem));
+ err = -EINVAL;
+ goto err1;
+ }
+
+ mem->umem = umem;
+ num_buf = umem->nmap;
+
+ rxe_mem_init(access, mem);
+
+ err = rxe_mem_alloc(rxe, mem, num_buf);
+ if (err) {
+ pr_warn("err %d from rxe_mem_alloc\n", err);
+ ib_umem_release(umem);
+ goto err1;
+ }
+
+ WARN_ON(!is_power_of_2(umem->page_size));
+
+ mem->page_shift = ilog2(umem->page_size);
+ mem->page_mask = umem->page_size - 1;
+
+ num_buf = 0;
+ map = mem->map;
+ if (length > 0) {
+ buf = map[0]->buf;
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ vaddr = page_address(sg_page(sg));
+ if (!vaddr) {
+ pr_warn("null vaddr\n");
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ buf->addr = (uintptr_t)vaddr;
+ buf->size = umem->page_size;
+ num_buf++;
+ buf++;
+
+ if (num_buf >= RXE_BUF_PER_MAP) {
+ map++;
+ buf = map[0]->buf;
+ num_buf = 0;
+ }
+ }
+ }
+
+ mem->pd = pd;
+ mem->umem = umem;
+ mem->access = access;
+ mem->length = length;
+ mem->iova = iova;
+ mem->va = start;
+ mem->offset = ib_umem_offset(umem);
+ mem->state = RXE_MEM_STATE_VALID;
+ mem->type = RXE_MEM_TYPE_MR;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd,
+ int max_pages, struct rxe_mem *mem)
+{
+ int err;
+
+ rxe_mem_init(0, mem);
+
+ /* In fastreg, we also set the rkey */
+ mem->ibmr.rkey = mem->ibmr.lkey;
+
+ err = rxe_mem_alloc(rxe, mem, max_pages);
+ if (err)
+ goto err1;
+
+ mem->pd = pd;
+ mem->max_buf = max_pages;
+ mem->state = RXE_MEM_STATE_FREE;
+ mem->type = RXE_MEM_TYPE_MR;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+static void lookup_iova(
+ struct rxe_mem *mem,
+ u64 iova,
+ int *m_out,
+ int *n_out,
+ size_t *offset_out)
+{
+ size_t offset = iova - mem->iova + mem->offset;
+ int map_index;
+ int buf_index;
+ u64 length;
+
+ if (likely(mem->page_shift)) {
+ *offset_out = offset & mem->page_mask;
+ offset >>= mem->page_shift;
+ *n_out = offset & mem->map_mask;
+ *m_out = offset >> mem->map_shift;
+ } else {
+ map_index = 0;
+ buf_index = 0;
+
+ length = mem->map[map_index]->buf[buf_index].size;
+
+ while (offset >= length) {
+ offset -= length;
+ buf_index++;
+
+ if (buf_index == RXE_BUF_PER_MAP) {
+ map_index++;
+ buf_index = 0;
+ }
+ length = mem->map[map_index]->buf[buf_index].size;
+ }
+
+ *m_out = map_index;
+ *n_out = buf_index;
+ *offset_out = offset;
+ }
+}
+
+void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length)
+{
+ size_t offset;
+ int m, n;
+ void *addr;
+
+ if (mem->state != RXE_MEM_STATE_VALID) {
+ pr_warn("mem not in valid state\n");
+ addr = NULL;
+ goto out;
+ }
+
+ if (!mem->map) {
+ addr = (void *)(uintptr_t)iova;
+ goto out;
+ }
+
+ if (mem_check_range(mem, iova, length)) {
+ pr_warn("range violation\n");
+ addr = NULL;
+ goto out;
+ }
+
+ lookup_iova(mem, iova, &m, &n, &offset);
+
+ if (offset + length > mem->map[m]->buf[n].size) {
+ pr_warn("crosses page boundary\n");
+ addr = NULL;
+ goto out;
+ }
+
+ addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset;
+
+out:
+ return addr;
+}
+
+/* copy data from a range (vaddr, vaddr+length-1) to or from
+ * a mem object starting at iova. Compute incremental value of
+ * crc32 if crcp is not zero. caller must hold a reference to mem
+ */
+int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length,
+ enum copy_direction dir, u32 *crcp)
+{
+ int err;
+ int bytes;
+ u8 *va;
+ struct rxe_map **map;
+ struct rxe_phys_buf *buf;
+ int m;
+ int i;
+ size_t offset;
+ u32 crc = crcp ? (*crcp) : 0;
+
+ if (mem->type == RXE_MEM_TYPE_DMA) {
+ u8 *src, *dest;
+
+ src = (dir == to_mem_obj) ?
+ addr : ((void *)(uintptr_t)iova);
+
+ dest = (dir == to_mem_obj) ?
+ ((void *)(uintptr_t)iova) : addr;
+
+ if (crcp)
+ *crcp = crc32_le(*crcp, src, length);
+
+ memcpy(dest, src, length);
+
+ return 0;
+ }
+
+ WARN_ON(!mem->map);
+
+ err = mem_check_range(mem, iova, length);
+ if (err) {
+ err = -EFAULT;
+ goto err1;
+ }
+
+ lookup_iova(mem, iova, &m, &i, &offset);
+
+ map = mem->map + m;
+ buf = map[0]->buf + i;
+
+ while (length > 0) {
+ u8 *src, *dest;
+
+ va = (u8 *)(uintptr_t)buf->addr + offset;
+ src = (dir == to_mem_obj) ? addr : va;
+ dest = (dir == to_mem_obj) ? va : addr;
+
+ bytes = buf->size - offset;
+
+ if (bytes > length)
+ bytes = length;
+
+ if (crcp)
+ crc = crc32_le(crc, src, bytes);
+
+ memcpy(dest, src, bytes);
+
+ length -= bytes;
+ addr += bytes;
+
+ offset = 0;
+ buf++;
+ i++;
+
+ if (i == RXE_BUF_PER_MAP) {
+ i = 0;
+ map++;
+ buf = map[0]->buf;
+ }
+ }
+
+ if (crcp)
+ *crcp = crc;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+/* copy data in or out of a wqe, i.e. sg list
+ * under the control of a dma descriptor
+ */
+int copy_data(
+ struct rxe_dev *rxe,
+ struct rxe_pd *pd,
+ int access,
+ struct rxe_dma_info *dma,
+ void *addr,
+ int length,
+ enum copy_direction dir,
+ u32 *crcp)
+{
+ int bytes;
+ struct rxe_sge *sge = &dma->sge[dma->cur_sge];
+ int offset = dma->sge_offset;
+ int resid = dma->resid;
+ struct rxe_mem *mem = NULL;
+ u64 iova;
+ int err;
+
+ if (length == 0)
+ return 0;
+
+ if (length > resid) {
+ err = -EINVAL;
+ goto err2;
+ }
+
+ if (sge->length && (offset < sge->length)) {
+ mem = lookup_mem(pd, access, sge->lkey, lookup_local);
+ if (!mem) {
+ err = -EINVAL;
+ goto err1;
+ }
+ }
+
+ while (length > 0) {
+ bytes = length;
+
+ if (offset >= sge->length) {
+ if (mem) {
+ rxe_drop_ref(mem);
+ mem = NULL;
+ }
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+
+ if (dma->cur_sge >= dma->num_sge) {
+ err = -ENOSPC;
+ goto err2;
+ }
+
+ if (sge->length) {
+ mem = lookup_mem(pd, access, sge->lkey,
+ lookup_local);
+ if (!mem) {
+ err = -EINVAL;
+ goto err1;
+ }
+ } else {
+ continue;
+ }
+ }
+
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ if (bytes > 0) {
+ iova = sge->addr + offset;
+
+ err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp);
+ if (err)
+ goto err2;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ addr += bytes;
+ }
+ }
+
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
+ if (mem)
+ rxe_drop_ref(mem);
+
+ return 0;
+
+err2:
+ if (mem)
+ rxe_drop_ref(mem);
+err1:
+ return err;
+}
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
+{
+ struct rxe_sge *sge = &dma->sge[dma->cur_sge];
+ int offset = dma->sge_offset;
+ int resid = dma->resid;
+
+ while (length) {
+ unsigned int bytes;
+
+ if (offset >= sge->length) {
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+ if (dma->cur_sge >= dma->num_sge)
+ return -ENOSPC;
+ }
+
+ bytes = length;
+
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ }
+
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
+ return 0;
+}
+
+/* (1) find the mem (mr or mw) corresponding to lkey/rkey
+ * depending on lookup_type
+ * (2) verify that the (qp) pd matches the mem pd
+ * (3) verify that the mem can support the requested access
+ * (4) verify that mem state is valid
+ */
+struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key,
+ enum lookup_type type)
+{
+ struct rxe_mem *mem;
+ struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
+ int index = key >> 8;
+
+ if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) {
+ mem = rxe_pool_get_index(&rxe->mr_pool, index);
+ if (!mem)
+ goto err1;
+ } else {
+ goto err1;
+ }
+
+ if ((type == lookup_local && mem->lkey != key) ||
+ (type == lookup_remote && mem->rkey != key))
+ goto err2;
+
+ if (mem->pd != pd)
+ goto err2;
+
+ if (access && !(access & mem->access))
+ goto err2;
+
+ if (mem->state != RXE_MEM_STATE_VALID)
+ goto err2;
+
+ return mem;
+
+err2:
+ rxe_drop_ref(mem);
+err1:
+ return NULL;
+}
+
+int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem,
+ u64 *page, int num_pages, u64 iova)
+{
+ int i;
+ int num_buf;
+ int err;
+ struct rxe_map **map;
+ struct rxe_phys_buf *buf;
+ int page_size;
+
+ if (num_pages > mem->max_buf) {
+ err = -EINVAL;
+ goto err1;
+ }
+
+ num_buf = 0;
+ page_size = 1 << mem->page_shift;
+ map = mem->map;
+ buf = map[0]->buf;
+
+ for (i = 0; i < num_pages; i++) {
+ buf->addr = *page++;
+ buf->size = page_size;
+ buf++;
+ num_buf++;
+
+ if (num_buf == RXE_BUF_PER_MAP) {
+ map++;
+ buf = map[0]->buf;
+ num_buf = 0;
+ }
+ }
+
+ mem->iova = iova;
+ mem->va = iova;
+ mem->length = num_pages << mem->page_shift;
+ mem->state = RXE_MEM_STATE_VALID;
+
+ return 0;
+
+err1:
+ return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
new file mode 100644
index 000000000000..eedf2f1cafdf
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <net/udp_tunnel.h>
+#include <net/sch_generic.h>
+#include <linux/netfilter.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe.h"
+#include "rxe_net.h"
+#include "rxe_loc.h"
+
+static LIST_HEAD(rxe_dev_list);
+static spinlock_t dev_list_lock; /* spinlock for device list */
+
+struct rxe_dev *net_to_rxe(struct net_device *ndev)
+{
+ struct rxe_dev *rxe;
+ struct rxe_dev *found = NULL;
+
+ spin_lock_bh(&dev_list_lock);
+ list_for_each_entry(rxe, &rxe_dev_list, list) {
+ if (rxe->ndev == ndev) {
+ found = rxe;
+ break;
+ }
+ }
+ spin_unlock_bh(&dev_list_lock);
+
+ return found;
+}
+
+struct rxe_dev *get_rxe_by_name(const char* name)
+{
+ struct rxe_dev *rxe;
+ struct rxe_dev *found = NULL;
+
+ spin_lock_bh(&dev_list_lock);
+ list_for_each_entry(rxe, &rxe_dev_list, list) {
+ if (!strcmp(name, rxe->ib_dev.name)) {
+ found = rxe;
+ break;
+ }
+ }
+ spin_unlock_bh(&dev_list_lock);
+ return found;
+}
+
+
+struct rxe_recv_sockets recv_sockets;
+
+static __be64 rxe_mac_to_eui64(struct net_device *ndev)
+{
+ unsigned char *mac_addr = ndev->dev_addr;
+ __be64 eui64;
+ unsigned char *dst = (unsigned char *)&eui64;
+
+ dst[0] = mac_addr[0] ^ 2;
+ dst[1] = mac_addr[1];
+ dst[2] = mac_addr[2];
+ dst[3] = 0xff;
+ dst[4] = 0xfe;
+ dst[5] = mac_addr[3];
+ dst[6] = mac_addr[4];
+ dst[7] = mac_addr[5];
+
+ return eui64;
+}
+
+static __be64 node_guid(struct rxe_dev *rxe)
+{
+ return rxe_mac_to_eui64(rxe->ndev);
+}
+
+static __be64 port_guid(struct rxe_dev *rxe)
+{
+ return rxe_mac_to_eui64(rxe->ndev);
+}
+
+static struct device *dma_device(struct rxe_dev *rxe)
+{
+ struct net_device *ndev;
+
+ ndev = rxe->ndev;
+
+ if (ndev->priv_flags & IFF_802_1Q_VLAN)
+ ndev = vlan_dev_real_dev(ndev);
+
+ return ndev->dev.parent;
+}
+
+static int mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ int err;
+ unsigned char ll_addr[ETH_ALEN];
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+ err = dev_mc_add(rxe->ndev, ll_addr);
+
+ return err;
+}
+
+static int mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ int err;
+ unsigned char ll_addr[ETH_ALEN];
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+ err = dev_mc_del(rxe->ndev, ll_addr);
+
+ return err;
+}
+
+static struct dst_entry *rxe_find_route4(struct net_device *ndev,
+ struct in_addr *saddr,
+ struct in_addr *daddr)
+{
+ struct rtable *rt;
+ struct flowi4 fl = { { 0 } };
+
+ memset(&fl, 0, sizeof(fl));
+ fl.flowi4_oif = ndev->ifindex;
+ memcpy(&fl.saddr, saddr, sizeof(*saddr));
+ memcpy(&fl.daddr, daddr, sizeof(*daddr));
+ fl.flowi4_proto = IPPROTO_UDP;
+
+ rt = ip_route_output_key(&init_net, &fl);
+ if (IS_ERR(rt)) {
+ pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr);
+ return NULL;
+ }
+
+ return &rt->dst;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *rxe_find_route6(struct net_device *ndev,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr)
+{
+ struct dst_entry *ndst;
+ struct flowi6 fl6 = { { 0 } };
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = ndev->ifindex;
+ memcpy(&fl6.saddr, saddr, sizeof(*saddr));
+ memcpy(&fl6.daddr, daddr, sizeof(*daddr));
+ fl6.flowi6_proto = IPPROTO_UDP;
+
+ if (unlikely(ipv6_stub->ipv6_dst_lookup(sock_net(recv_sockets.sk6->sk),
+ recv_sockets.sk6->sk, &ndst, &fl6))) {
+ pr_err_ratelimited("no route to %pI6\n", daddr);
+ goto put;
+ }
+
+ if (unlikely(ndst->error)) {
+ pr_err("no route to %pI6\n", daddr);
+ goto put;
+ }
+
+ return ndst;
+put:
+ dst_release(ndst);
+ return NULL;
+}
+
+#else
+
+static struct dst_entry *rxe_find_route6(struct net_device *ndev,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr)
+{
+ return NULL;
+}
+
+#endif
+
+static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct udphdr *udph;
+ struct net_device *ndev = skb->dev;
+ struct rxe_dev *rxe = net_to_rxe(ndev);
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+ if (!rxe)
+ goto drop;
+
+ if (skb_linearize(skb)) {
+ pr_err("skb_linearize failed\n");
+ goto drop;
+ }
+
+ udph = udp_hdr(skb);
+ pkt->rxe = rxe;
+ pkt->port_num = 1;
+ pkt->hdr = (u8 *)(udph + 1);
+ pkt->mask = RXE_GRH_MASK;
+ pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);
+
+ return rxe_rcv(skb);
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
+static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
+ bool ipv6)
+{
+ int err;
+ struct socket *sock;
+ struct udp_port_cfg udp_cfg;
+ struct udp_tunnel_sock_cfg tnl_cfg;
+
+ memset(&udp_cfg, 0, sizeof(udp_cfg));
+
+ if (ipv6) {
+ udp_cfg.family = AF_INET6;
+ udp_cfg.ipv6_v6only = 1;
+ } else {
+ udp_cfg.family = AF_INET;
+ }
+
+ udp_cfg.local_udp_port = port;
+
+ /* Create UDP socket */
+ err = udp_sock_create(net, &udp_cfg, &sock);
+ if (err < 0) {
+ pr_err("failed to create udp socket. err = %d\n", err);
+ return ERR_PTR(err);
+ }
+
+ tnl_cfg.sk_user_data = NULL;
+ tnl_cfg.encap_type = 1;
+ tnl_cfg.encap_rcv = rxe_udp_encap_recv;
+ tnl_cfg.encap_destroy = NULL;
+
+ /* Setup UDP tunnel */
+ setup_udp_tunnel_sock(net, sock, &tnl_cfg);
+
+ return sock;
+}
+
+void rxe_release_udp_tunnel(struct socket *sk)
+{
+ if (sk)
+ udp_tunnel_sock_release(sk);
+}
+
+static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
+ __be16 dst_port)
+{
+ struct udphdr *udph;
+
+ __skb_push(skb, sizeof(*udph));
+ skb_reset_transport_header(skb);
+ udph = udp_hdr(skb);
+
+ udph->dest = dst_port;
+ udph->source = src_port;
+ udph->len = htons(skb->len);
+ udph->check = 0;
+}
+
+static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+ struct iphdr *iph;
+
+ skb_scrub_packet(skb, xnet);
+
+ skb_clear_hash(skb);
+ skb_dst_set(skb, dst);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->version = IPVERSION;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->frag_off = df;
+ iph->protocol = proto;
+ iph->tos = tos;
+ iph->daddr = daddr;
+ iph->saddr = saddr;
+ iph->ttl = ttl;
+ __ip_select_ident(dev_net(dst->dev), iph,
+ skb_shinfo(skb)->gso_segs ?: 1);
+ iph->tot_len = htons(skb->len);
+ ip_send_check(iph);
+}
+
+static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ __u8 proto, __u8 prio, __u8 ttl)
+{
+ struct ipv6hdr *ip6h;
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
+ | IPSKB_REROUTED);
+ skb_dst_set(skb, dst);
+
+ __skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, prio, htonl(0));
+ ip6h->payload_len = htons(skb->len);
+ ip6h->nexthdr = proto;
+ ip6h->hop_limit = ttl;
+ ip6h->daddr = *daddr;
+ ip6h->saddr = *saddr;
+ ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
+}
+
+static int prepare4(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av)
+{
+ struct dst_entry *dst;
+ bool xnet = false;
+ __be16 df = htons(IP_DF);
+ struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
+ struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+ dst = rxe_find_route4(rxe->ndev, saddr, daddr);
+ if (!dst) {
+ pr_err("Host not reachable\n");
+ return -EHOSTUNREACH;
+ }
+
+ if (!memcmp(saddr, daddr, sizeof(*daddr)))
+ pkt->mask |= RXE_LOOPBACK_MASK;
+
+ prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
+ htons(ROCE_V2_UDP_DPORT));
+
+ prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP,
+ av->grh.traffic_class, av->grh.hop_limit, df, xnet);
+ return 0;
+}
+
+static int prepare6(struct rxe_dev *rxe, struct sk_buff *skb, struct rxe_av *av)
+{
+ struct dst_entry *dst;
+ struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
+ struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+ dst = rxe_find_route6(rxe->ndev, saddr, daddr);
+ if (!dst) {
+ pr_err("Host not reachable\n");
+ return -EHOSTUNREACH;
+ }
+
+ if (!memcmp(saddr, daddr, sizeof(*daddr)))
+ pkt->mask |= RXE_LOOPBACK_MASK;
+
+ prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT),
+ htons(ROCE_V2_UDP_DPORT));
+
+ prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP,
+ av->grh.traffic_class,
+ av->grh.hop_limit);
+ return 0;
+}
+
+static int prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb, u32 *crc)
+{
+ int err = 0;
+ struct rxe_av *av = rxe_get_av(pkt);
+
+ if (av->network_type == RDMA_NETWORK_IPV4)
+ err = prepare4(rxe, skb, av);
+ else if (av->network_type == RDMA_NETWORK_IPV6)
+ err = prepare6(rxe, skb, av);
+
+ *crc = rxe_icrc_hdr(pkt, skb);
+
+ return err;
+}
+
+static void rxe_skb_tx_dtor(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct rxe_qp *qp = sk->sk_user_data;
+ int skb_out = atomic_dec_return(&qp->skb_out);
+
+ if (unlikely(qp->need_req_skb &&
+ skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
+ rxe_run_task(&qp->req.task, 1);
+}
+
+static int send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+ struct rxe_av *av;
+ int err;
+
+ av = rxe_get_av(pkt);
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ nskb->destructor = rxe_skb_tx_dtor;
+ nskb->sk = pkt->qp->sk->sk;
+
+ if (av->network_type == RDMA_NETWORK_IPV4) {
+ err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb);
+ } else if (av->network_type == RDMA_NETWORK_IPV6) {
+ err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb);
+ } else {
+ pr_err("Unknown layer 3 protocol: %d\n", av->network_type);
+ kfree_skb(nskb);
+ return -EINVAL;
+ }
+
+ if (unlikely(net_xmit_eval(err))) {
+ pr_debug("error sending packet: %d\n", err);
+ return -EAGAIN;
+ }
+
+ kfree_skb(skb);
+
+ return 0;
+}
+
+static int loopback(struct sk_buff *skb)
+{
+ return rxe_rcv(skb);
+}
+
+static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av)
+{
+ return rxe->port.port_guid == av->grh.dgid.global.interface_id;
+}
+
+static struct sk_buff *init_packet(struct rxe_dev *rxe, struct rxe_av *av,
+ int paylen, struct rxe_pkt_info *pkt)
+{
+ unsigned int hdr_len;
+ struct sk_buff *skb;
+
+ if (av->network_type == RDMA_NETWORK_IPV4)
+ hdr_len = ETH_HLEN + sizeof(struct udphdr) +
+ sizeof(struct iphdr);
+ else
+ hdr_len = ETH_HLEN + sizeof(struct udphdr) +
+ sizeof(struct ipv6hdr);
+
+ skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(rxe->ndev),
+ GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev));
+
+ skb->dev = rxe->ndev;
+ if (av->network_type == RDMA_NETWORK_IPV4)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ skb->protocol = htons(ETH_P_IPV6);
+
+ pkt->rxe = rxe;
+ pkt->port_num = 1;
+ pkt->hdr = skb_put(skb, paylen);
+ pkt->mask |= RXE_GRH_MASK;
+
+ memset(pkt->hdr, 0, paylen);
+
+ return skb;
+}
+
+/*
+ * this is required by rxe_cfg to match rxe devices in
+ * /sys/class/infiniband up with their underlying ethernet devices
+ */
+static char *parent_name(struct rxe_dev *rxe, unsigned int port_num)
+{
+ return rxe->ndev->name;
+}
+
+static enum rdma_link_layer link_layer(struct rxe_dev *rxe,
+ unsigned int port_num)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
+static struct rxe_ifc_ops ifc_ops = {
+ .node_guid = node_guid,
+ .port_guid = port_guid,
+ .dma_device = dma_device,
+ .mcast_add = mcast_add,
+ .mcast_delete = mcast_delete,
+ .prepare = prepare,
+ .send = send,
+ .loopback = loopback,
+ .init_packet = init_packet,
+ .parent_name = parent_name,
+ .link_layer = link_layer,
+};
+
+struct rxe_dev *rxe_net_add(struct net_device *ndev)
+{
+ int err;
+ struct rxe_dev *rxe = NULL;
+
+ rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe));
+ if (!rxe)
+ return NULL;
+
+ rxe->ifc_ops = &ifc_ops;
+ rxe->ndev = ndev;
+
+ err = rxe_add(rxe, ndev->mtu);
+ if (err) {
+ ib_dealloc_device(&rxe->ib_dev);
+ return NULL;
+ }
+
+ spin_lock_bh(&dev_list_lock);
+ list_add_tail(&rxe_dev_list, &rxe->list);
+ spin_unlock_bh(&dev_list_lock);
+ return rxe;
+}
+
+void rxe_remove_all(void)
+{
+ spin_lock_bh(&dev_list_lock);
+ while (!list_empty(&rxe_dev_list)) {
+ struct rxe_dev *rxe =
+ list_first_entry(&rxe_dev_list, struct rxe_dev, list);
+
+ list_del(&rxe->list);
+ spin_unlock_bh(&dev_list_lock);
+ rxe_remove(rxe);
+ spin_lock_bh(&dev_list_lock);
+ }
+ spin_unlock_bh(&dev_list_lock);
+}
+EXPORT_SYMBOL(rxe_remove_all);
+
+static void rxe_port_event(struct rxe_dev *rxe,
+ enum ib_event_type event)
+{
+ struct ib_event ev;
+
+ ev.device = &rxe->ib_dev;
+ ev.element.port_num = 1;
+ ev.event = event;
+
+ ib_dispatch_event(&ev);
+}
+
+/* Caller must hold net_info_lock */
+void rxe_port_up(struct rxe_dev *rxe)
+{
+ struct rxe_port *port;
+
+ port = &rxe->port;
+ port->attr.state = IB_PORT_ACTIVE;
+ port->attr.phys_state = IB_PHYS_STATE_LINK_UP;
+
+ rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
+ pr_info("rxe: set %s active\n", rxe->ib_dev.name);
+ return;
+}
+
+/* Caller must hold net_info_lock */
+void rxe_port_down(struct rxe_dev *rxe)
+{
+ struct rxe_port *port;
+
+ port = &rxe->port;
+ port->attr.state = IB_PORT_DOWN;
+ port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN;
+
+ rxe_port_event(rxe, IB_EVENT_PORT_ERR);
+ pr_info("rxe: set %s down\n", rxe->ib_dev.name);
+ return;
+}
+
+static int rxe_notify(struct notifier_block *not_blk,
+ unsigned long event,
+ void *arg)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(arg);
+ struct rxe_dev *rxe = net_to_rxe(ndev);
+
+ if (!rxe)
+ goto out;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ list_del(&rxe->list);
+ rxe_remove(rxe);
+ break;
+ case NETDEV_UP:
+ rxe_port_up(rxe);
+ break;
+ case NETDEV_DOWN:
+ rxe_port_down(rxe);
+ break;
+ case NETDEV_CHANGEMTU:
+ pr_info("rxe: %s changed mtu to %d\n", ndev->name, ndev->mtu);
+ rxe_set_mtu(rxe, ndev->mtu);
+ break;
+ case NETDEV_REBOOT:
+ case NETDEV_CHANGE:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_CHANGENAME:
+ case NETDEV_FEAT_CHANGE:
+ default:
+ pr_info("rxe: ignoring netdev event = %ld for %s\n",
+ event, ndev->name);
+ break;
+ }
+out:
+ return NOTIFY_OK;
+}
+
+struct notifier_block rxe_net_notifier = {
+ .notifier_call = rxe_notify,
+};
+
+int rxe_net_ipv4_init(void)
+{
+ spin_lock_init(&dev_list_lock);
+
+ recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
+ htons(ROCE_V2_UDP_DPORT), false);
+ if (IS_ERR(recv_sockets.sk4)) {
+ recv_sockets.sk4 = NULL;
+ pr_err("rxe: Failed to create IPv4 UDP tunnel\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int rxe_net_ipv6_init(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+
+ spin_lock_init(&dev_list_lock);
+
+ recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
+ htons(ROCE_V2_UDP_DPORT), true);
+ if (IS_ERR(recv_sockets.sk6)) {
+ recv_sockets.sk6 = NULL;
+ pr_err("rxe: Failed to create IPv6 UDP tunnel\n");
+ return -1;
+ }
+#endif
+ return 0;
+}
+
+void rxe_net_exit(void)
+{
+ rxe_release_udp_tunnel(recv_sockets.sk6);
+ rxe_release_udp_tunnel(recv_sockets.sk4);
+ unregister_netdevice_notifier(&rxe_net_notifier);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
new file mode 100644
index 000000000000..0daf7f09e5b5
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_NET_H
+#define RXE_NET_H
+
+#include <net/sock.h>
+#include <net/if_inet6.h>
+#include <linux/module.h>
+
+struct rxe_recv_sockets {
+ struct socket *sk4;
+ struct socket *sk6;
+};
+
+extern struct rxe_recv_sockets recv_sockets;
+extern struct notifier_block rxe_net_notifier;
+void rxe_release_udp_tunnel(struct socket *sk);
+
+struct rxe_dev *rxe_net_add(struct net_device *ndev);
+
+int rxe_net_ipv4_init(void);
+int rxe_net_ipv6_init(void);
+void rxe_net_exit(void);
+
+#endif /* RXE_NET_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
new file mode 100644
index 000000000000..61927c165b59
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_pack.h>
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+
+/* useful information about work request opcodes and pkt opcodes in
+ * table form
+ */
+struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
+ [IB_WR_RDMA_WRITE] = {
+ .name = "IB_WR_RDMA_WRITE",
+ .mask = {
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ },
+ },
+ [IB_WR_RDMA_WRITE_WITH_IMM] = {
+ .name = "IB_WR_RDMA_WRITE_WITH_IMM",
+ .mask = {
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ },
+ },
+ [IB_WR_SEND] = {
+ .name = "IB_WR_SEND",
+ .mask = {
+ [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK,
+ },
+ },
+ [IB_WR_SEND_WITH_IMM] = {
+ .name = "IB_WR_SEND_WITH_IMM",
+ .mask = {
+ [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK,
+ },
+ },
+ [IB_WR_RDMA_READ] = {
+ .name = "IB_WR_RDMA_READ",
+ .mask = {
+ [IB_QPT_RC] = WR_READ_MASK,
+ },
+ },
+ [IB_WR_ATOMIC_CMP_AND_SWP] = {
+ .name = "IB_WR_ATOMIC_CMP_AND_SWP",
+ .mask = {
+ [IB_QPT_RC] = WR_ATOMIC_MASK,
+ },
+ },
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = {
+ .name = "IB_WR_ATOMIC_FETCH_AND_ADD",
+ .mask = {
+ [IB_QPT_RC] = WR_ATOMIC_MASK,
+ },
+ },
+ [IB_WR_LSO] = {
+ .name = "IB_WR_LSO",
+ .mask = {
+ /* not supported */
+ },
+ },
+ [IB_WR_SEND_WITH_INV] = {
+ .name = "IB_WR_SEND_WITH_INV",
+ .mask = {
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK,
+ },
+ },
+ [IB_WR_RDMA_READ_WITH_INV] = {
+ .name = "IB_WR_RDMA_READ_WITH_INV",
+ .mask = {
+ [IB_QPT_RC] = WR_READ_MASK,
+ },
+ },
+ [IB_WR_LOCAL_INV] = {
+ .name = "IB_WR_LOCAL_INV",
+ .mask = {
+ [IB_QPT_RC] = WR_REG_MASK,
+ },
+ },
+ [IB_WR_REG_MR] = {
+ .name = "IB_WR_REG_MR",
+ .mask = {
+ [IB_QPT_RC] = WR_REG_MASK,
+ },
+ },
+};
+
+struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
+ [IB_OPCODE_RC_SEND_FIRST] = {
+ .name = "IB_OPCODE_RC_SEND_FIRST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK
+ | RXE_SEND_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_RC_SEND_MIDDLE]",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST] = {
+ .name = "IB_OPCODE_RC_SEND_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+ | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+ | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_START_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_REQUEST",
+ .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST",
+ .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+ | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST",
+ .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY",
+ .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RC_ACKNOWLEDGE",
+ .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE",
+ .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_ATMACK] = RXE_BTH_BYTES
+ + RXE_AETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_COMPARE_SWAP] = {
+ .name = "IB_OPCODE_RC_COMPARE_SWAP",
+ .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_ATMETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_FETCH_ADD] = {
+ .name = "IB_OPCODE_RC_FETCH_ADD",
+ .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_ATMETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = {
+ .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE",
+ .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY_INV",
+ .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IETH_BYTES,
+ }
+ },
+
+ /* UC */
+ [IB_OPCODE_UC_SEND_FIRST] = {
+ .name = "IB_OPCODE_UC_SEND_FIRST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK
+ | RXE_SEND_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_UC_SEND_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_LAST] = {
+ .name = "IB_OPCODE_UC_SEND_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+ | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_ONLY] = {
+ .name = "IB_OPCODE_UC_SEND_ONLY",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK
+ | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_START_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+
+ /* RD */
+ [IB_OPCODE_RD_SEND_FIRST] = {
+ .name = "IB_OPCODE_RD_SEND_FIRST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_RD_SEND_MIDDLE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_SEND_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_LAST] = {
+ .name = "IB_OPCODE_RD_SEND_LAST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+ | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_SEND_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_ONLY] = {
+ .name = "IB_OPCODE_RD_SEND_ONLY",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+ | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+ | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_LAST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK
+ | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+ | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_WRITE_MASK | RXE_START_MASK
+ | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+ | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_WRITE_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES
+ + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_RETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_RETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_REQUEST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_REQUEST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK
+ | RXE_REQ_MASK | RXE_READ_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RETH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK
+ | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+ | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE",
+ .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK
+ | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_ACK_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK
+ | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RD_ACKNOWLEDGE",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK
+ | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_ATMACK] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_COMPARE_SWAP] = {
+ .name = "RD_COMPARE_SWAP",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK
+ | RXE_REQ_MASK | RXE_ATOMIC_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_ATMETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ + RXE_ATMETH_BYTES
+ + RXE_DETH_BYTES +
+ + RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_FETCH_ADD] = {
+ .name = "IB_OPCODE_RD_FETCH_ADD",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK
+ | RXE_REQ_MASK | RXE_ATOMIC_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES
+ + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES,
+ [RXE_ATMETH] = RXE_BTH_BYTES
+ + RXE_RDETH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ + RXE_ATMETH_BYTES
+ + RXE_DETH_BYTES +
+ + RXE_RDETH_BYTES,
+ }
+ },
+
+ /* UD */
+ [IB_OPCODE_UD_SEND_ONLY] = {
+ .name = "IB_OPCODE_UD_SEND_ONLY",
+ .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK
+ | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK
+ | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_DETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK
+ | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK
+ | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_DETH] = RXE_BTH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES
+ + RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES
+ + RXE_DETH_BYTES
+ + RXE_IMMDT_BYTES,
+ }
+ },
+
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
new file mode 100644
index 000000000000..307604e9c78d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_OPCODE_H
+#define RXE_OPCODE_H
+
+/*
+ * contains header bit mask definitions and header lengths
+ * declaration of the rxe_opcode_info struct and
+ * rxe_wr_opcode_info struct
+ */
+
+enum rxe_wr_mask {
+ WR_INLINE_MASK = BIT(0),
+ WR_ATOMIC_MASK = BIT(1),
+ WR_SEND_MASK = BIT(2),
+ WR_READ_MASK = BIT(3),
+ WR_WRITE_MASK = BIT(4),
+ WR_LOCAL_MASK = BIT(5),
+ WR_REG_MASK = BIT(6),
+
+ WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
+ WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+ WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
+ WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+
+#define WR_MAX_QPT (8)
+
+struct rxe_wr_opcode_info {
+ char *name;
+ enum rxe_wr_mask mask[WR_MAX_QPT];
+};
+
+extern struct rxe_wr_opcode_info rxe_wr_opcode_info[];
+
+enum rxe_hdr_type {
+ RXE_LRH,
+ RXE_GRH,
+ RXE_BTH,
+ RXE_RETH,
+ RXE_AETH,
+ RXE_ATMETH,
+ RXE_ATMACK,
+ RXE_IETH,
+ RXE_RDETH,
+ RXE_DETH,
+ RXE_IMMDT,
+ RXE_PAYLOAD,
+ NUM_HDR_TYPES
+};
+
+enum rxe_hdr_mask {
+ RXE_LRH_MASK = BIT(RXE_LRH),
+ RXE_GRH_MASK = BIT(RXE_GRH),
+ RXE_BTH_MASK = BIT(RXE_BTH),
+ RXE_IMMDT_MASK = BIT(RXE_IMMDT),
+ RXE_RETH_MASK = BIT(RXE_RETH),
+ RXE_AETH_MASK = BIT(RXE_AETH),
+ RXE_ATMETH_MASK = BIT(RXE_ATMETH),
+ RXE_ATMACK_MASK = BIT(RXE_ATMACK),
+ RXE_IETH_MASK = BIT(RXE_IETH),
+ RXE_RDETH_MASK = BIT(RXE_RDETH),
+ RXE_DETH_MASK = BIT(RXE_DETH),
+ RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD),
+
+ RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0),
+ RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1),
+ RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2),
+ RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3),
+ RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4),
+ RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5),
+
+ RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 6),
+ RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7),
+
+ RXE_START_MASK = BIT(NUM_HDR_TYPES + 8),
+ RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9),
+ RXE_END_MASK = BIT(NUM_HDR_TYPES + 10),
+
+ RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12),
+
+ RXE_READ_OR_ATOMIC = (RXE_READ_MASK | RXE_ATOMIC_MASK),
+ RXE_WRITE_OR_SEND = (RXE_WRITE_MASK | RXE_SEND_MASK),
+};
+
+#define OPCODE_NONE (-1)
+#define RXE_NUM_OPCODE 256
+
+struct rxe_opcode_info {
+ char *name;
+ enum rxe_hdr_mask mask;
+ int length;
+ int offset[NUM_HDR_TYPES];
+};
+
+extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE];
+
+#endif /* RXE_OPCODE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
new file mode 100644
index 000000000000..f459c43a77c8
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_PARAM_H
+#define RXE_PARAM_H
+
+static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu)
+{
+ if (mtu < 256)
+ return 0;
+ else if (mtu < 512)
+ return IB_MTU_256;
+ else if (mtu < 1024)
+ return IB_MTU_512;
+ else if (mtu < 2048)
+ return IB_MTU_1024;
+ else if (mtu < 4096)
+ return IB_MTU_2048;
+ else
+ return IB_MTU_4096;
+}
+
+/* Find the IB mtu for a given network MTU. */
+static inline enum ib_mtu eth_mtu_int_to_enum(int mtu)
+{
+ mtu -= RXE_MAX_HDR_LENGTH;
+
+ return rxe_mtu_int_to_enum(mtu);
+}
+
+/* default/initial rxe device parameter settings */
+enum rxe_device_param {
+ RXE_FW_VER = 0,
+ RXE_MAX_MR_SIZE = -1ull,
+ RXE_PAGE_SIZE_CAP = 0xfffff000,
+ RXE_VENDOR_ID = 0,
+ RXE_VENDOR_PART_ID = 0,
+ RXE_HW_VER = 0,
+ RXE_MAX_QP = 0x10000,
+ RXE_MAX_QP_WR = 0x4000,
+ RXE_MAX_INLINE_DATA = 400,
+ RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR
+ | IB_DEVICE_BAD_QKEY_CNTR
+ | IB_DEVICE_AUTO_PATH_MIG
+ | IB_DEVICE_CHANGE_PHY_PORT
+ | IB_DEVICE_UD_AV_PORT_ENFORCE
+ | IB_DEVICE_PORT_ACTIVE_EVENT
+ | IB_DEVICE_SYS_IMAGE_GUID
+ | IB_DEVICE_RC_RNR_NAK_GEN
+ | IB_DEVICE_SRQ_RESIZE
+ | IB_DEVICE_MEM_MGT_EXTENSIONS,
+ RXE_MAX_SGE = 32,
+ RXE_MAX_SGE_RD = 32,
+ RXE_MAX_CQ = 16384,
+ RXE_MAX_LOG_CQE = 13,
+ RXE_MAX_MR = 2 * 1024,
+ RXE_MAX_PD = 0x7ffc,
+ RXE_MAX_QP_RD_ATOM = 128,
+ RXE_MAX_EE_RD_ATOM = 0,
+ RXE_MAX_RES_RD_ATOM = 0x3f000,
+ RXE_MAX_QP_INIT_RD_ATOM = 128,
+ RXE_MAX_EE_INIT_RD_ATOM = 0,
+ RXE_ATOMIC_CAP = 1,
+ RXE_MAX_EE = 0,
+ RXE_MAX_RDD = 0,
+ RXE_MAX_MW = 0,
+ RXE_MAX_RAW_IPV6_QP = 0,
+ RXE_MAX_RAW_ETHY_QP = 0,
+ RXE_MAX_MCAST_GRP = 8192,
+ RXE_MAX_MCAST_QP_ATTACH = 56,
+ RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000,
+ RXE_MAX_AH = 100,
+ RXE_MAX_FMR = 0,
+ RXE_MAX_MAP_PER_FMR = 0,
+ RXE_MAX_SRQ = 960,
+ RXE_MAX_SRQ_WR = 0x4000,
+ RXE_MIN_SRQ_WR = 1,
+ RXE_MAX_SRQ_SGE = 27,
+ RXE_MIN_SRQ_SGE = 1,
+ RXE_MAX_FMR_PAGE_LIST_LEN = 512,
+ RXE_MAX_PKEYS = 64,
+ RXE_LOCAL_CA_ACK_DELAY = 15,
+
+ RXE_MAX_UCONTEXT = 512,
+
+ RXE_NUM_PORT = 1,
+ RXE_NUM_COMP_VECTORS = 1,
+
+ RXE_MIN_QP_INDEX = 16,
+ RXE_MAX_QP_INDEX = 0x00020000,
+
+ RXE_MIN_SRQ_INDEX = 0x00020001,
+ RXE_MAX_SRQ_INDEX = 0x00040000,
+
+ RXE_MIN_MR_INDEX = 0x00000001,
+ RXE_MAX_MR_INDEX = 0x00040000,
+ RXE_MIN_MW_INDEX = 0x00040001,
+ RXE_MAX_MW_INDEX = 0x00060000,
+ RXE_MAX_PKT_PER_ACK = 64,
+
+ RXE_MAX_UNACKED_PSNS = 128,
+
+ /* Max inflight SKBs per queue pair */
+ RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64,
+ RXE_INFLIGHT_SKBS_PER_QP_LOW = 16,
+
+ /* Delay before calling arbiter timer */
+ RXE_NSEC_ARB_TIMER_DELAY = 200,
+};
+
+/* default/initial rxe port parameters */
+enum rxe_port_param {
+ RXE_PORT_STATE = IB_PORT_DOWN,
+ RXE_PORT_MAX_MTU = IB_MTU_4096,
+ RXE_PORT_ACTIVE_MTU = IB_MTU_256,
+ RXE_PORT_GID_TBL_LEN = 1024,
+ RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP,
+ RXE_PORT_MAX_MSG_SZ = 0x800000,
+ RXE_PORT_BAD_PKEY_CNTR = 0,
+ RXE_PORT_QKEY_VIOL_CNTR = 0,
+ RXE_PORT_LID = 0,
+ RXE_PORT_SM_LID = 0,
+ RXE_PORT_SM_SL = 0,
+ RXE_PORT_LMC = 0,
+ RXE_PORT_MAX_VL_NUM = 1,
+ RXE_PORT_SUBNET_TIMEOUT = 0,
+ RXE_PORT_INIT_TYPE_REPLY = 0,
+ RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X,
+ RXE_PORT_ACTIVE_SPEED = 1,
+ RXE_PORT_PKEY_TBL_LEN = 64,
+ RXE_PORT_PHYS_STATE = 2,
+ RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL,
+};
+
+/* default/initial port info parameters */
+enum rxe_port_info_param {
+ RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */
+ RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */
+ RXE_PORT_INFO_OPER_VL = 1, /* 1 */
+};
+
+#endif /* RXE_PARAM_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
new file mode 100644
index 000000000000..6bac0717c540
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* info about object pools
+ * note that mr and mw share a single index space
+ * so that one can map an lkey to the correct type of object
+ */
+struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
+ [RXE_TYPE_UC] = {
+ .name = "rxe-uc",
+ .size = sizeof(struct rxe_ucontext),
+ },
+ [RXE_TYPE_PD] = {
+ .name = "rxe-pd",
+ .size = sizeof(struct rxe_pd),
+ },
+ [RXE_TYPE_AH] = {
+ .name = "rxe-ah",
+ .size = sizeof(struct rxe_ah),
+ .flags = RXE_POOL_ATOMIC,
+ },
+ [RXE_TYPE_SRQ] = {
+ .name = "rxe-srq",
+ .size = sizeof(struct rxe_srq),
+ .flags = RXE_POOL_INDEX,
+ .min_index = RXE_MIN_SRQ_INDEX,
+ .max_index = RXE_MAX_SRQ_INDEX,
+ },
+ [RXE_TYPE_QP] = {
+ .name = "rxe-qp",
+ .size = sizeof(struct rxe_qp),
+ .cleanup = rxe_qp_cleanup,
+ .flags = RXE_POOL_INDEX,
+ .min_index = RXE_MIN_QP_INDEX,
+ .max_index = RXE_MAX_QP_INDEX,
+ },
+ [RXE_TYPE_CQ] = {
+ .name = "rxe-cq",
+ .size = sizeof(struct rxe_cq),
+ .cleanup = rxe_cq_cleanup,
+ },
+ [RXE_TYPE_MR] = {
+ .name = "rxe-mr",
+ .size = sizeof(struct rxe_mem),
+ .cleanup = rxe_mem_cleanup,
+ .flags = RXE_POOL_INDEX,
+ .max_index = RXE_MAX_MR_INDEX,
+ .min_index = RXE_MIN_MR_INDEX,
+ },
+ [RXE_TYPE_MW] = {
+ .name = "rxe-mw",
+ .size = sizeof(struct rxe_mem),
+ .flags = RXE_POOL_INDEX,
+ .max_index = RXE_MAX_MW_INDEX,
+ .min_index = RXE_MIN_MW_INDEX,
+ },
+ [RXE_TYPE_MC_GRP] = {
+ .name = "rxe-mc_grp",
+ .size = sizeof(struct rxe_mc_grp),
+ .cleanup = rxe_mc_cleanup,
+ .flags = RXE_POOL_KEY,
+ .key_offset = offsetof(struct rxe_mc_grp, mgid),
+ .key_size = sizeof(union ib_gid),
+ },
+ [RXE_TYPE_MC_ELEM] = {
+ .name = "rxe-mc_elem",
+ .size = sizeof(struct rxe_mc_elem),
+ .flags = RXE_POOL_ATOMIC,
+ },
+};
+
+static inline char *pool_name(struct rxe_pool *pool)
+{
+ return rxe_type_info[pool->type].name;
+}
+
+static inline struct kmem_cache *pool_cache(struct rxe_pool *pool)
+{
+ return rxe_type_info[pool->type].cache;
+}
+
+static inline enum rxe_elem_type rxe_type(void *arg)
+{
+ struct rxe_pool_entry *elem = arg;
+
+ return elem->pool->type;
+}
+
+int rxe_cache_init(void)
+{
+ int err;
+ int i;
+ size_t size;
+ struct rxe_type_info *type;
+
+ for (i = 0; i < RXE_NUM_TYPES; i++) {
+ type = &rxe_type_info[i];
+ size = ALIGN(type->size, RXE_POOL_ALIGN);
+ type->cache = kmem_cache_create(type->name, size,
+ RXE_POOL_ALIGN,
+ RXE_POOL_CACHE_FLAGS, NULL);
+ if (!type->cache) {
+ pr_err("Unable to init kmem cache for %s\n",
+ type->name);
+ err = -ENOMEM;
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ while (--i >= 0) {
+ kmem_cache_destroy(type->cache);
+ type->cache = NULL;
+ }
+
+ return err;
+}
+
+void rxe_cache_exit(void)
+{
+ int i;
+ struct rxe_type_info *type;
+
+ for (i = 0; i < RXE_NUM_TYPES; i++) {
+ type = &rxe_type_info[i];
+ kmem_cache_destroy(type->cache);
+ type->cache = NULL;
+ }
+}
+
+static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
+{
+ int err = 0;
+ size_t size;
+
+ if ((max - min + 1) < pool->max_elem) {
+ pr_warn("not enough indices for max_elem\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pool->max_index = max;
+ pool->min_index = min;
+
+ size = BITS_TO_LONGS(max - min + 1) * sizeof(long);
+ pool->table = kmalloc(size, GFP_KERNEL);
+ if (!pool->table) {
+ pr_warn("no memory for bit table\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ pool->table_size = size;
+ bitmap_zero(pool->table, max - min + 1);
+
+out:
+ return err;
+}
+
+int rxe_pool_init(
+ struct rxe_dev *rxe,
+ struct rxe_pool *pool,
+ enum rxe_elem_type type,
+ unsigned max_elem)
+{
+ int err = 0;
+ size_t size = rxe_type_info[type].size;
+
+ memset(pool, 0, sizeof(*pool));
+
+ pool->rxe = rxe;
+ pool->type = type;
+ pool->max_elem = max_elem;
+ pool->elem_size = ALIGN(size, RXE_POOL_ALIGN);
+ pool->flags = rxe_type_info[type].flags;
+ pool->tree = RB_ROOT;
+ pool->cleanup = rxe_type_info[type].cleanup;
+
+ atomic_set(&pool->num_elem, 0);
+
+ kref_init(&pool->ref_cnt);
+
+ spin_lock_init(&pool->pool_lock);
+
+ if (rxe_type_info[type].flags & RXE_POOL_INDEX) {
+ err = rxe_pool_init_index(pool,
+ rxe_type_info[type].max_index,
+ rxe_type_info[type].min_index);
+ if (err)
+ goto out;
+ }
+
+ if (rxe_type_info[type].flags & RXE_POOL_KEY) {
+ pool->key_offset = rxe_type_info[type].key_offset;
+ pool->key_size = rxe_type_info[type].key_size;
+ }
+
+ pool->state = rxe_pool_valid;
+
+out:
+ return err;
+}
+
+static void rxe_pool_release(struct kref *kref)
+{
+ struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt);
+
+ pool->state = rxe_pool_invalid;
+ kfree(pool->table);
+}
+
+static void rxe_pool_put(struct rxe_pool *pool)
+{
+ kref_put(&pool->ref_cnt, rxe_pool_release);
+}
+
+int rxe_pool_cleanup(struct rxe_pool *pool)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ pool->state = rxe_pool_invalid;
+ if (atomic_read(&pool->num_elem) > 0)
+ pr_warn("%s pool destroyed with unfree'd elem\n",
+ pool_name(pool));
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ rxe_pool_put(pool);
+
+ return 0;
+}
+
+static u32 alloc_index(struct rxe_pool *pool)
+{
+ u32 index;
+ u32 range = pool->max_index - pool->min_index + 1;
+
+ index = find_next_zero_bit(pool->table, range, pool->last);
+ if (index >= range)
+ index = find_first_zero_bit(pool->table, range);
+
+ set_bit(index, pool->table);
+ pool->last = index;
+ return index + pool->min_index;
+}
+
+static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+ struct rb_node **link = &pool->tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rxe_pool_entry *elem;
+
+ while (*link) {
+ parent = *link;
+ elem = rb_entry(parent, struct rxe_pool_entry, node);
+
+ if (elem->index == new->index) {
+ pr_warn("element already exists!\n");
+ goto out;
+ }
+
+ if (elem->index > new->index)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, &pool->tree);
+out:
+ return;
+}
+
+static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new)
+{
+ struct rb_node **link = &pool->tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct rxe_pool_entry *elem;
+ int cmp;
+
+ while (*link) {
+ parent = *link;
+ elem = rb_entry(parent, struct rxe_pool_entry, node);
+
+ cmp = memcmp((u8 *)elem + pool->key_offset,
+ (u8 *)new + pool->key_offset, pool->key_size);
+
+ if (cmp == 0) {
+ pr_warn("key already exists!\n");
+ goto out;
+ }
+
+ if (cmp > 0)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, &pool->tree);
+out:
+ return;
+}
+
+void rxe_add_key(void *arg, void *key)
+{
+ struct rxe_pool_entry *elem = arg;
+ struct rxe_pool *pool = elem->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ memcpy((u8 *)elem + pool->key_offset, key, pool->key_size);
+ insert_key(pool, elem);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+}
+
+void rxe_drop_key(void *arg)
+{
+ struct rxe_pool_entry *elem = arg;
+ struct rxe_pool *pool = elem->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ rb_erase(&elem->node, &pool->tree);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+}
+
+void rxe_add_index(void *arg)
+{
+ struct rxe_pool_entry *elem = arg;
+ struct rxe_pool *pool = elem->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ elem->index = alloc_index(pool);
+ insert_index(pool, elem);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+}
+
+void rxe_drop_index(void *arg)
+{
+ struct rxe_pool_entry *elem = arg;
+ struct rxe_pool *pool = elem->pool;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ clear_bit(elem->index - pool->min_index, pool->table);
+ rb_erase(&elem->node, &pool->tree);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+}
+
+void *rxe_alloc(struct rxe_pool *pool)
+{
+ struct rxe_pool_entry *elem;
+ unsigned long flags;
+
+ might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ if (pool->state != rxe_pool_valid) {
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return NULL;
+ }
+ kref_get(&pool->ref_cnt);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ kref_get(&pool->rxe->ref_cnt);
+
+ if (atomic_inc_return(&pool->num_elem) > pool->max_elem) {
+ atomic_dec(&pool->num_elem);
+ rxe_dev_put(pool->rxe);
+ rxe_pool_put(pool);
+ return NULL;
+ }
+
+ elem = kmem_cache_zalloc(pool_cache(pool),
+ (pool->flags & RXE_POOL_ATOMIC) ?
+ GFP_ATOMIC : GFP_KERNEL);
+
+ elem->pool = pool;
+ kref_init(&elem->ref_cnt);
+
+ return elem;
+}
+
+void rxe_elem_release(struct kref *kref)
+{
+ struct rxe_pool_entry *elem =
+ container_of(kref, struct rxe_pool_entry, ref_cnt);
+ struct rxe_pool *pool = elem->pool;
+
+ if (pool->cleanup)
+ pool->cleanup(elem);
+
+ kmem_cache_free(pool_cache(pool), elem);
+ atomic_dec(&pool->num_elem);
+ rxe_dev_put(pool->rxe);
+ rxe_pool_put(pool);
+}
+
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
+{
+ struct rb_node *node = NULL;
+ struct rxe_pool_entry *elem = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+
+ if (pool->state != rxe_pool_valid)
+ goto out;
+
+ node = pool->tree.rb_node;
+
+ while (node) {
+ elem = rb_entry(node, struct rxe_pool_entry, node);
+
+ if (elem->index > index)
+ node = node->rb_left;
+ else if (elem->index < index)
+ node = node->rb_right;
+ else
+ break;
+ }
+
+ if (node)
+ kref_get(&elem->ref_cnt);
+
+out:
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return node ? (void *)elem : NULL;
+}
+
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key)
+{
+ struct rb_node *node = NULL;
+ struct rxe_pool_entry *elem = NULL;
+ int cmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+
+ if (pool->state != rxe_pool_valid)
+ goto out;
+
+ node = pool->tree.rb_node;
+
+ while (node) {
+ elem = rb_entry(node, struct rxe_pool_entry, node);
+
+ cmp = memcmp((u8 *)elem + pool->key_offset,
+ key, pool->key_size);
+
+ if (cmp > 0)
+ node = node->rb_left;
+ else if (cmp < 0)
+ node = node->rb_right;
+ else
+ break;
+ }
+
+ if (node)
+ kref_get(&elem->ref_cnt);
+
+out:
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return node ? ((void *)elem) : NULL;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
new file mode 100644
index 000000000000..4d04830adcae
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_POOL_H
+#define RXE_POOL_H
+
+#define RXE_POOL_ALIGN (16)
+#define RXE_POOL_CACHE_FLAGS (0)
+
+enum rxe_pool_flags {
+ RXE_POOL_ATOMIC = BIT(0),
+ RXE_POOL_INDEX = BIT(1),
+ RXE_POOL_KEY = BIT(2),
+};
+
+enum rxe_elem_type {
+ RXE_TYPE_UC,
+ RXE_TYPE_PD,
+ RXE_TYPE_AH,
+ RXE_TYPE_SRQ,
+ RXE_TYPE_QP,
+ RXE_TYPE_CQ,
+ RXE_TYPE_MR,
+ RXE_TYPE_MW,
+ RXE_TYPE_MC_GRP,
+ RXE_TYPE_MC_ELEM,
+ RXE_NUM_TYPES, /* keep me last */
+};
+
+struct rxe_type_info {
+ char *name;
+ size_t size;
+ void (*cleanup)(void *obj);
+ enum rxe_pool_flags flags;
+ u32 max_index;
+ u32 min_index;
+ size_t key_offset;
+ size_t key_size;
+ struct kmem_cache *cache;
+};
+
+extern struct rxe_type_info rxe_type_info[];
+
+enum rxe_pool_state {
+ rxe_pool_invalid,
+ rxe_pool_valid,
+};
+
+struct rxe_pool_entry {
+ struct rxe_pool *pool;
+ struct kref ref_cnt;
+ struct list_head list;
+
+ /* only used if indexed or keyed */
+ struct rb_node node;
+ u32 index;
+};
+
+struct rxe_pool {
+ struct rxe_dev *rxe;
+ spinlock_t pool_lock; /* pool spinlock */
+ size_t elem_size;
+ struct kref ref_cnt;
+ void (*cleanup)(void *obj);
+ enum rxe_pool_state state;
+ enum rxe_pool_flags flags;
+ enum rxe_elem_type type;
+
+ unsigned int max_elem;
+ atomic_t num_elem;
+
+ /* only used if indexed or keyed */
+ struct rb_root tree;
+ unsigned long *table;
+ size_t table_size;
+ u32 max_index;
+ u32 min_index;
+ u32 last;
+ size_t key_offset;
+ size_t key_size;
+};
+
+/* initialize slab caches for managed objects */
+int rxe_cache_init(void);
+
+/* cleanup slab caches for managed objects */
+void rxe_cache_exit(void);
+
+/* initialize a pool of objects with given limit on
+ * number of elements. gets parameters from rxe_type_info
+ * pool elements will be allocated out of a slab cache
+ */
+int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+ enum rxe_elem_type type, u32 max_elem);
+
+/* free resources from object pool */
+int rxe_pool_cleanup(struct rxe_pool *pool);
+
+/* allocate an object from pool */
+void *rxe_alloc(struct rxe_pool *pool);
+
+/* assign an index to an indexed object and insert object into
+ * pool's rb tree
+ */
+void rxe_add_index(void *elem);
+
+/* drop an index and remove object from rb tree */
+void rxe_drop_index(void *elem);
+
+/* assign a key to a keyed object and insert object into
+ * pool's rb tree
+ */
+void rxe_add_key(void *elem, void *key);
+
+/* remove elem from rb tree */
+void rxe_drop_key(void *elem);
+
+/* lookup an indexed object from index. takes a reference on object */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
+
+/* lookup keyed object from key. takes a reference on the object */
+void *rxe_pool_get_key(struct rxe_pool *pool, void *key);
+
+/* cleanup an object when all references are dropped */
+void rxe_elem_release(struct kref *kref);
+
+/* take a reference on an object */
+#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt)
+
+/* drop a reference on an object */
+#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release)
+
+#endif /* RXE_POOL_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
new file mode 100644
index 000000000000..22ba24f2a2c1
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+char *rxe_qp_state_name[] = {
+ [QP_STATE_RESET] = "RESET",
+ [QP_STATE_INIT] = "INIT",
+ [QP_STATE_READY] = "READY",
+ [QP_STATE_DRAIN] = "DRAIN",
+ [QP_STATE_DRAINED] = "DRAINED",
+ [QP_STATE_ERROR] = "ERROR",
+};
+
+static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
+ int has_srq)
+{
+ if (cap->max_send_wr > rxe->attr.max_qp_wr) {
+ pr_warn("invalid send wr = %d > %d\n",
+ cap->max_send_wr, rxe->attr.max_qp_wr);
+ goto err1;
+ }
+
+ if (cap->max_send_sge > rxe->attr.max_sge) {
+ pr_warn("invalid send sge = %d > %d\n",
+ cap->max_send_sge, rxe->attr.max_sge);
+ goto err1;
+ }
+
+ if (!has_srq) {
+ if (cap->max_recv_wr > rxe->attr.max_qp_wr) {
+ pr_warn("invalid recv wr = %d > %d\n",
+ cap->max_recv_wr, rxe->attr.max_qp_wr);
+ goto err1;
+ }
+
+ if (cap->max_recv_sge > rxe->attr.max_sge) {
+ pr_warn("invalid recv sge = %d > %d\n",
+ cap->max_recv_sge, rxe->attr.max_sge);
+ goto err1;
+ }
+ }
+
+ if (cap->max_inline_data > rxe->max_inline_data) {
+ pr_warn("invalid max inline data = %d > %d\n",
+ cap->max_inline_data, rxe->max_inline_data);
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init)
+{
+ struct ib_qp_cap *cap = &init->cap;
+ struct rxe_port *port;
+ int port_num = init->port_num;
+
+ if (!init->recv_cq || !init->send_cq) {
+ pr_warn("missing cq\n");
+ goto err1;
+ }
+
+ if (rxe_qp_chk_cap(rxe, cap, !!init->srq))
+ goto err1;
+
+ if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) {
+ if (port_num != 1) {
+ pr_warn("invalid port = %d\n", port_num);
+ goto err1;
+ }
+
+ port = &rxe->port;
+
+ if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) {
+ pr_warn("SMI QP exists for port %d\n", port_num);
+ goto err1;
+ }
+
+ if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) {
+ pr_warn("GSI QP exists for port %d\n", port_num);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n)
+{
+ qp->resp.res_head = 0;
+ qp->resp.res_tail = 0;
+ qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL);
+
+ if (!qp->resp.resources)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void free_rd_atomic_resources(struct rxe_qp *qp)
+{
+ if (qp->resp.resources) {
+ int i;
+
+ for (i = 0; i < qp->attr.max_rd_atomic; i++) {
+ struct resp_res *res = &qp->resp.resources[i];
+
+ free_rd_atomic_resource(qp, res);
+ }
+ kfree(qp->resp.resources);
+ qp->resp.resources = NULL;
+ }
+}
+
+void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res)
+{
+ if (res->type == RXE_ATOMIC_MASK) {
+ rxe_drop_ref(qp);
+ kfree_skb(res->atomic.skb);
+ } else if (res->type == RXE_READ_MASK) {
+ if (res->read.mr)
+ rxe_drop_ref(res->read.mr);
+ }
+ res->type = 0;
+}
+
+static void cleanup_rd_atomic_resources(struct rxe_qp *qp)
+{
+ int i;
+ struct resp_res *res;
+
+ if (qp->resp.resources) {
+ for (i = 0; i < qp->attr.max_rd_atomic; i++) {
+ res = &qp->resp.resources[i];
+ free_rd_atomic_resource(qp, res);
+ }
+ }
+}
+
+static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_init_attr *init)
+{
+ struct rxe_port *port;
+ u32 qpn;
+
+ qp->sq_sig_type = init->sq_sig_type;
+ qp->attr.path_mtu = 1;
+ qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu);
+
+ qpn = qp->pelem.index;
+ port = &rxe->port;
+
+ switch (init->qp_type) {
+ case IB_QPT_SMI:
+ qp->ibqp.qp_num = 0;
+ port->qp_smi_index = qpn;
+ qp->attr.port_num = init->port_num;
+ break;
+
+ case IB_QPT_GSI:
+ qp->ibqp.qp_num = 1;
+ port->qp_gsi_index = qpn;
+ qp->attr.port_num = init->port_num;
+ break;
+
+ default:
+ qp->ibqp.qp_num = qpn;
+ break;
+ }
+
+ INIT_LIST_HEAD(&qp->grp_list);
+
+ skb_queue_head_init(&qp->send_pkts);
+
+ spin_lock_init(&qp->grp_lock);
+ spin_lock_init(&qp->state_lock);
+
+ atomic_set(&qp->ssn, 0);
+ atomic_set(&qp->skb_out, 0);
+}
+
+static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_init_attr *init,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ int err;
+ int wqe_size;
+
+ err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
+ if (err < 0)
+ return err;
+ qp->sk->sk->sk_user_data = qp;
+
+ qp->sq.max_wr = init->cap.max_send_wr;
+ qp->sq.max_sge = init->cap.max_send_sge;
+ qp->sq.max_inline = init->cap.max_inline_data;
+
+ wqe_size = max_t(int, sizeof(struct rxe_send_wqe) +
+ qp->sq.max_sge * sizeof(struct ib_sge),
+ sizeof(struct rxe_send_wqe) +
+ qp->sq.max_inline);
+
+ qp->sq.queue = rxe_queue_init(rxe,
+ &qp->sq.max_wr,
+ wqe_size);
+ if (!qp->sq.queue)
+ return -ENOMEM;
+
+ err = do_mmap_info(rxe, udata, true,
+ context, qp->sq.queue->buf,
+ qp->sq.queue->buf_size, &qp->sq.queue->ip);
+
+ if (err) {
+ kvfree(qp->sq.queue->buf);
+ kfree(qp->sq.queue);
+ return err;
+ }
+
+ qp->req.wqe_index = producer_index(qp->sq.queue);
+ qp->req.state = QP_STATE_RESET;
+ qp->req.opcode = -1;
+ qp->comp.opcode = -1;
+
+ spin_lock_init(&qp->sq.sq_lock);
+ skb_queue_head_init(&qp->req_pkts);
+
+ rxe_init_task(rxe, &qp->req.task, qp,
+ rxe_requester, "req");
+ rxe_init_task(rxe, &qp->comp.task, qp,
+ rxe_completer, "comp");
+
+ init_timer(&qp->rnr_nak_timer);
+ qp->rnr_nak_timer.function = rnr_nak_timer;
+ qp->rnr_nak_timer.data = (unsigned long)qp;
+
+ init_timer(&qp->retrans_timer);
+ qp->retrans_timer.function = retransmit_timer;
+ qp->retrans_timer.data = (unsigned long)qp;
+ qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
+
+ return 0;
+}
+
+static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_init_attr *init,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ int err;
+ int wqe_size;
+
+ if (!qp->srq) {
+ qp->rq.max_wr = init->cap.max_recv_wr;
+ qp->rq.max_sge = init->cap.max_recv_sge;
+
+ wqe_size = rcv_wqe_size(qp->rq.max_sge);
+
+ pr_debug("max_wr = %d, max_sge = %d, wqe_size = %d\n",
+ qp->rq.max_wr, qp->rq.max_sge, wqe_size);
+
+ qp->rq.queue = rxe_queue_init(rxe,
+ &qp->rq.max_wr,
+ wqe_size);
+ if (!qp->rq.queue)
+ return -ENOMEM;
+
+ err = do_mmap_info(rxe, udata, false, context,
+ qp->rq.queue->buf,
+ qp->rq.queue->buf_size,
+ &qp->rq.queue->ip);
+ if (err) {
+ kvfree(qp->rq.queue->buf);
+ kfree(qp->rq.queue);
+ return err;
+ }
+ }
+
+ spin_lock_init(&qp->rq.producer_lock);
+ spin_lock_init(&qp->rq.consumer_lock);
+
+ skb_queue_head_init(&qp->resp_pkts);
+
+ rxe_init_task(rxe, &qp->resp.task, qp,
+ rxe_responder, "resp");
+
+ qp->resp.opcode = OPCODE_NONE;
+ qp->resp.msn = 0;
+ qp->resp.state = QP_STATE_RESET;
+
+ return 0;
+}
+
+/* called by the create qp verb */
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+ struct ib_qp_init_attr *init, struct ib_udata *udata,
+ struct ib_pd *ibpd)
+{
+ int err;
+ struct rxe_cq *rcq = to_rcq(init->recv_cq);
+ struct rxe_cq *scq = to_rcq(init->send_cq);
+ struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
+ struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+
+ rxe_add_ref(pd);
+ rxe_add_ref(rcq);
+ rxe_add_ref(scq);
+ if (srq)
+ rxe_add_ref(srq);
+
+ qp->pd = pd;
+ qp->rcq = rcq;
+ qp->scq = scq;
+ qp->srq = srq;
+
+ rxe_qp_init_misc(rxe, qp, init);
+
+ err = rxe_qp_init_req(rxe, qp, init, context, udata);
+ if (err)
+ goto err1;
+
+ err = rxe_qp_init_resp(rxe, qp, init, context, udata);
+ if (err)
+ goto err2;
+
+ qp->attr.qp_state = IB_QPS_RESET;
+ qp->valid = 1;
+
+ return 0;
+
+err2:
+ rxe_queue_cleanup(qp->sq.queue);
+err1:
+ if (srq)
+ rxe_drop_ref(srq);
+ rxe_drop_ref(scq);
+ rxe_drop_ref(rcq);
+ rxe_drop_ref(pd);
+
+ return err;
+}
+
+/* called by the query qp verb */
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init)
+{
+ init->event_handler = qp->ibqp.event_handler;
+ init->qp_context = qp->ibqp.qp_context;
+ init->send_cq = qp->ibqp.send_cq;
+ init->recv_cq = qp->ibqp.recv_cq;
+ init->srq = qp->ibqp.srq;
+
+ init->cap.max_send_wr = qp->sq.max_wr;
+ init->cap.max_send_sge = qp->sq.max_sge;
+ init->cap.max_inline_data = qp->sq.max_inline;
+
+ if (!qp->srq) {
+ init->cap.max_recv_wr = qp->rq.max_wr;
+ init->cap.max_recv_sge = qp->rq.max_sge;
+ }
+
+ init->sq_sig_type = qp->sq_sig_type;
+
+ init->qp_type = qp->ibqp.qp_type;
+ init->port_num = 1;
+
+ return 0;
+}
+
+/* called by the modify qp verb, this routine checks all the parameters before
+ * making any changes
+ */
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_attr *attr, int mask)
+{
+ enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ?
+ attr->cur_qp_state : qp->attr.qp_state;
+ enum ib_qp_state new_state = (mask & IB_QP_STATE) ?
+ attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask,
+ IB_LINK_LAYER_ETHERNET)) {
+ pr_warn("invalid mask or state for qp\n");
+ goto err1;
+ }
+
+ if (mask & IB_QP_STATE) {
+ if (cur_state == IB_QPS_SQD) {
+ if (qp->req.state == QP_STATE_DRAIN &&
+ new_state != IB_QPS_ERR)
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_PORT) {
+ if (attr->port_num != 1) {
+ pr_warn("invalid port %d\n", attr->port_num);
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq))
+ goto err1;
+
+ if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr))
+ goto err1;
+
+ if (mask & IB_QP_ALT_PATH) {
+ if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr))
+ goto err1;
+ if (attr->alt_port_num != 1) {
+ pr_warn("invalid alt port %d\n", attr->alt_port_num);
+ goto err1;
+ }
+ if (attr->alt_timeout > 31) {
+ pr_warn("invalid QP alt timeout %d > 31\n",
+ attr->alt_timeout);
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_PATH_MTU) {
+ struct rxe_port *port = &rxe->port;
+
+ enum ib_mtu max_mtu = port->attr.max_mtu;
+ enum ib_mtu mtu = attr->path_mtu;
+
+ if (mtu > max_mtu) {
+ pr_debug("invalid mtu (%d) > (%d)\n",
+ ib_mtu_enum_to_int(mtu),
+ ib_mtu_enum_to_int(max_mtu));
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) {
+ pr_warn("invalid max_rd_atomic %d > %d\n",
+ attr->max_rd_atomic,
+ rxe->attr.max_qp_rd_atom);
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_TIMEOUT) {
+ if (attr->timeout > 31) {
+ pr_warn("invalid QP timeout %d > 31\n",
+ attr->timeout);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+/* move the qp to the reset state */
+static void rxe_qp_reset(struct rxe_qp *qp)
+{
+ /* stop tasks from running */
+ rxe_disable_task(&qp->resp.task);
+
+ /* stop request/comp */
+ if (qp->sq.queue) {
+ if (qp_type(qp) == IB_QPT_RC)
+ rxe_disable_task(&qp->comp.task);
+ rxe_disable_task(&qp->req.task);
+ }
+
+ /* move qp to the reset state */
+ qp->req.state = QP_STATE_RESET;
+ qp->resp.state = QP_STATE_RESET;
+
+ /* let state machines reset themselves drain work and packet queues
+ * etc.
+ */
+ __rxe_do_task(&qp->resp.task);
+
+ if (qp->sq.queue) {
+ __rxe_do_task(&qp->comp.task);
+ __rxe_do_task(&qp->req.task);
+ }
+
+ /* cleanup attributes */
+ atomic_set(&qp->ssn, 0);
+ qp->req.opcode = -1;
+ qp->req.need_retry = 0;
+ qp->req.noack_pkts = 0;
+ qp->resp.msn = 0;
+ qp->resp.opcode = -1;
+ qp->resp.drop_msg = 0;
+ qp->resp.goto_error = 0;
+ qp->resp.sent_psn_nak = 0;
+
+ if (qp->resp.mr) {
+ rxe_drop_ref(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ cleanup_rd_atomic_resources(qp);
+
+ /* reenable tasks */
+ rxe_enable_task(&qp->resp.task);
+
+ if (qp->sq.queue) {
+ if (qp_type(qp) == IB_QPT_RC)
+ rxe_enable_task(&qp->comp.task);
+
+ rxe_enable_task(&qp->req.task);
+ }
+}
+
+/* drain the send queue */
+static void rxe_qp_drain(struct rxe_qp *qp)
+{
+ if (qp->sq.queue) {
+ if (qp->req.state != QP_STATE_DRAINED) {
+ qp->req.state = QP_STATE_DRAIN;
+ if (qp_type(qp) == IB_QPT_RC)
+ rxe_run_task(&qp->comp.task, 1);
+ else
+ __rxe_do_task(&qp->comp.task);
+ rxe_run_task(&qp->req.task, 1);
+ }
+ }
+}
+
+/* move the qp to the error state */
+void rxe_qp_error(struct rxe_qp *qp)
+{
+ qp->req.state = QP_STATE_ERROR;
+ qp->resp.state = QP_STATE_ERROR;
+
+ /* drain work and packet queues */
+ rxe_run_task(&qp->resp.task, 1);
+
+ if (qp_type(qp) == IB_QPT_RC)
+ rxe_run_task(&qp->comp.task, 1);
+ else
+ __rxe_do_task(&qp->comp.task);
+ rxe_run_task(&qp->req.task, 1);
+}
+
+/* called by the modify qp verb */
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ union ib_gid sgid;
+ struct ib_gid_attr sgid_attr;
+
+ if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ int max_rd_atomic = __roundup_pow_of_two(attr->max_rd_atomic);
+
+ free_rd_atomic_resources(qp);
+
+ err = alloc_rd_atomic_resources(qp, max_rd_atomic);
+ if (err)
+ return err;
+
+ qp->attr.max_rd_atomic = max_rd_atomic;
+ atomic_set(&qp->req.rd_atomic, max_rd_atomic);
+ }
+
+ if (mask & IB_QP_CUR_STATE)
+ qp->attr.cur_qp_state = attr->qp_state;
+
+ if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+ qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify;
+
+ if (mask & IB_QP_ACCESS_FLAGS)
+ qp->attr.qp_access_flags = attr->qp_access_flags;
+
+ if (mask & IB_QP_PKEY_INDEX)
+ qp->attr.pkey_index = attr->pkey_index;
+
+ if (mask & IB_QP_PORT)
+ qp->attr.port_num = attr->port_num;
+
+ if (mask & IB_QP_QKEY)
+ qp->attr.qkey = attr->qkey;
+
+ if (mask & IB_QP_AV) {
+ ib_get_cached_gid(&rxe->ib_dev, 1,
+ attr->ah_attr.grh.sgid_index, &sgid,
+ &sgid_attr);
+ rxe_av_from_attr(rxe, attr->port_num, &qp->pri_av,
+ &attr->ah_attr);
+ rxe_av_fill_ip_info(rxe, &qp->pri_av, &attr->ah_attr,
+ &sgid_attr, &sgid);
+ if (sgid_attr.ndev)
+ dev_put(sgid_attr.ndev);
+ }
+
+ if (mask & IB_QP_ALT_PATH) {
+ ib_get_cached_gid(&rxe->ib_dev, 1,
+ attr->alt_ah_attr.grh.sgid_index, &sgid,
+ &sgid_attr);
+
+ rxe_av_from_attr(rxe, attr->alt_port_num, &qp->alt_av,
+ &attr->alt_ah_attr);
+ rxe_av_fill_ip_info(rxe, &qp->alt_av, &attr->alt_ah_attr,
+ &sgid_attr, &sgid);
+ if (sgid_attr.ndev)
+ dev_put(sgid_attr.ndev);
+
+ qp->attr.alt_port_num = attr->alt_port_num;
+ qp->attr.alt_pkey_index = attr->alt_pkey_index;
+ qp->attr.alt_timeout = attr->alt_timeout;
+ }
+
+ if (mask & IB_QP_PATH_MTU) {
+ qp->attr.path_mtu = attr->path_mtu;
+ qp->mtu = ib_mtu_enum_to_int(attr->path_mtu);
+ }
+
+ if (mask & IB_QP_TIMEOUT) {
+ qp->attr.timeout = attr->timeout;
+ if (attr->timeout == 0) {
+ qp->qp_timeout_jiffies = 0;
+ } else {
+ /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */
+ int j = nsecs_to_jiffies(4096ULL << attr->timeout);
+
+ qp->qp_timeout_jiffies = j ? j : 1;
+ }
+ }
+
+ if (mask & IB_QP_RETRY_CNT) {
+ qp->attr.retry_cnt = attr->retry_cnt;
+ qp->comp.retry_cnt = attr->retry_cnt;
+ pr_debug("set retry count = %d\n", attr->retry_cnt);
+ }
+
+ if (mask & IB_QP_RNR_RETRY) {
+ qp->attr.rnr_retry = attr->rnr_retry;
+ qp->comp.rnr_retry = attr->rnr_retry;
+ pr_debug("set rnr retry count = %d\n", attr->rnr_retry);
+ }
+
+ if (mask & IB_QP_RQ_PSN) {
+ qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK);
+ qp->resp.psn = qp->attr.rq_psn;
+ pr_debug("set resp psn = 0x%x\n", qp->resp.psn);
+ }
+
+ if (mask & IB_QP_MIN_RNR_TIMER) {
+ qp->attr.min_rnr_timer = attr->min_rnr_timer;
+ pr_debug("set min rnr timer = 0x%x\n",
+ attr->min_rnr_timer);
+ }
+
+ if (mask & IB_QP_SQ_PSN) {
+ qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK);
+ qp->req.psn = qp->attr.sq_psn;
+ qp->comp.psn = qp->attr.sq_psn;
+ pr_debug("set req psn = 0x%x\n", qp->req.psn);
+ }
+
+ if (mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ qp->attr.max_dest_rd_atomic =
+ __roundup_pow_of_two(attr->max_dest_rd_atomic);
+ }
+
+ if (mask & IB_QP_PATH_MIG_STATE)
+ qp->attr.path_mig_state = attr->path_mig_state;
+
+ if (mask & IB_QP_DEST_QPN)
+ qp->attr.dest_qp_num = attr->dest_qp_num;
+
+ if (mask & IB_QP_STATE) {
+ qp->attr.qp_state = attr->qp_state;
+
+ switch (attr->qp_state) {
+ case IB_QPS_RESET:
+ pr_debug("qp state -> RESET\n");
+ rxe_qp_reset(qp);
+ break;
+
+ case IB_QPS_INIT:
+ pr_debug("qp state -> INIT\n");
+ qp->req.state = QP_STATE_INIT;
+ qp->resp.state = QP_STATE_INIT;
+ break;
+
+ case IB_QPS_RTR:
+ pr_debug("qp state -> RTR\n");
+ qp->resp.state = QP_STATE_READY;
+ break;
+
+ case IB_QPS_RTS:
+ pr_debug("qp state -> RTS\n");
+ qp->req.state = QP_STATE_READY;
+ break;
+
+ case IB_QPS_SQD:
+ pr_debug("qp state -> SQD\n");
+ rxe_qp_drain(qp);
+ break;
+
+ case IB_QPS_SQE:
+ pr_warn("qp state -> SQE !!?\n");
+ /* Not possible from modify_qp. */
+ break;
+
+ case IB_QPS_ERR:
+ pr_debug("qp state -> ERR\n");
+ rxe_qp_error(qp);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* called by the query qp verb */
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+ *attr = qp->attr;
+
+ attr->rq_psn = qp->resp.psn;
+ attr->sq_psn = qp->req.psn;
+
+ attr->cap.max_send_wr = qp->sq.max_wr;
+ attr->cap.max_send_sge = qp->sq.max_sge;
+ attr->cap.max_inline_data = qp->sq.max_inline;
+
+ if (!qp->srq) {
+ attr->cap.max_recv_wr = qp->rq.max_wr;
+ attr->cap.max_recv_sge = qp->rq.max_sge;
+ }
+
+ rxe_av_to_attr(rxe, &qp->pri_av, &attr->ah_attr);
+ rxe_av_to_attr(rxe, &qp->alt_av, &attr->alt_ah_attr);
+
+ if (qp->req.state == QP_STATE_DRAIN) {
+ attr->sq_draining = 1;
+ /* applications that get this state
+ * typically spin on it. yield the
+ * processor
+ */
+ cond_resched();
+ } else {
+ attr->sq_draining = 0;
+ }
+
+ pr_debug("attr->sq_draining = %d\n", attr->sq_draining);
+
+ return 0;
+}
+
+/* called by the destroy qp verb */
+void rxe_qp_destroy(struct rxe_qp *qp)
+{
+ qp->valid = 0;
+ qp->qp_timeout_jiffies = 0;
+ rxe_cleanup_task(&qp->resp.task);
+
+ del_timer_sync(&qp->retrans_timer);
+ del_timer_sync(&qp->rnr_nak_timer);
+
+ rxe_cleanup_task(&qp->req.task);
+ if (qp_type(qp) == IB_QPT_RC)
+ rxe_cleanup_task(&qp->comp.task);
+
+ /* flush out any receive wr's or pending requests */
+ __rxe_do_task(&qp->req.task);
+ if (qp->sq.queue) {
+ __rxe_do_task(&qp->comp.task);
+ __rxe_do_task(&qp->req.task);
+ }
+}
+
+/* called when the last reference to the qp is dropped */
+void rxe_qp_cleanup(void *arg)
+{
+ struct rxe_qp *qp = arg;
+
+ rxe_drop_all_mcast_groups(qp);
+
+ if (qp->sq.queue)
+ rxe_queue_cleanup(qp->sq.queue);
+
+ if (qp->srq)
+ rxe_drop_ref(qp->srq);
+
+ if (qp->rq.queue)
+ rxe_queue_cleanup(qp->rq.queue);
+
+ if (qp->scq)
+ rxe_drop_ref(qp->scq);
+ if (qp->rcq)
+ rxe_drop_ref(qp->rcq);
+ if (qp->pd)
+ rxe_drop_ref(qp->pd);
+
+ if (qp->resp.mr) {
+ rxe_drop_ref(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ free_rd_atomic_resources(qp);
+
+ kernel_sock_shutdown(qp->sk, SHUT_RDWR);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c
new file mode 100644
index 000000000000..08274254eb88
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_queue.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must retailuce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/vmalloc.h>
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+int do_mmap_info(struct rxe_dev *rxe,
+ struct ib_udata *udata,
+ bool is_req,
+ struct ib_ucontext *context,
+ struct rxe_queue_buf *buf,
+ size_t buf_size,
+ struct rxe_mmap_info **ip_p)
+{
+ int err;
+ u32 len, offset;
+ struct rxe_mmap_info *ip = NULL;
+
+ if (udata) {
+ if (is_req) {
+ len = udata->outlen - sizeof(struct mminfo);
+ offset = sizeof(struct mminfo);
+ } else {
+ len = udata->outlen;
+ offset = 0;
+ }
+
+ if (len < sizeof(ip->info))
+ goto err1;
+
+ ip = rxe_create_mmap_info(rxe, buf_size, context, buf);
+ if (!ip)
+ goto err1;
+
+ err = copy_to_user(udata->outbuf + offset, &ip->info,
+ sizeof(ip->info));
+ if (err)
+ goto err2;
+
+ spin_lock_bh(&rxe->pending_lock);
+ list_add(&ip->pending_mmaps, &rxe->pending_mmaps);
+ spin_unlock_bh(&rxe->pending_lock);
+ }
+
+ *ip_p = ip;
+
+ return 0;
+
+err2:
+ kfree(ip);
+err1:
+ return -EINVAL;
+}
+
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+ int *num_elem,
+ unsigned int elem_size)
+{
+ struct rxe_queue *q;
+ size_t buf_size;
+ unsigned int num_slots;
+
+ /* num_elem == 0 is allowed, but uninteresting */
+ if (*num_elem < 0)
+ goto err1;
+
+ q = kmalloc(sizeof(*q), GFP_KERNEL);
+ if (!q)
+ goto err1;
+
+ q->rxe = rxe;
+
+ /* used in resize, only need to copy used part of queue */
+ q->elem_size = elem_size;
+
+ /* pad element up to at least a cacheline and always a power of 2 */
+ if (elem_size < cache_line_size())
+ elem_size = cache_line_size();
+ elem_size = roundup_pow_of_two(elem_size);
+
+ q->log2_elem_size = order_base_2(elem_size);
+
+ num_slots = *num_elem + 1;
+ num_slots = roundup_pow_of_two(num_slots);
+ q->index_mask = num_slots - 1;
+
+ buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size;
+
+ q->buf = vmalloc_user(buf_size);
+ if (!q->buf)
+ goto err2;
+
+ q->buf->log2_elem_size = q->log2_elem_size;
+ q->buf->index_mask = q->index_mask;
+
+ q->buf_size = buf_size;
+
+ *num_elem = num_slots - 1;
+ return q;
+
+err2:
+ kfree(q);
+err1:
+ return NULL;
+}
+
+/* copies elements from original q to new q and then swaps the contents of the
+ * two q headers. This is so that if anyone is holding a pointer to q it will
+ * still work
+ */
+static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q,
+ unsigned int num_elem)
+{
+ if (!queue_empty(q) && (num_elem < queue_count(q)))
+ return -EINVAL;
+
+ while (!queue_empty(q)) {
+ memcpy(producer_addr(new_q), consumer_addr(q),
+ new_q->elem_size);
+ advance_producer(new_q);
+ advance_consumer(q);
+ }
+
+ swap(*q, *new_q);
+
+ return 0;
+}
+
+int rxe_queue_resize(struct rxe_queue *q,
+ unsigned int *num_elem_p,
+ unsigned int elem_size,
+ struct ib_ucontext *context,
+ struct ib_udata *udata,
+ spinlock_t *producer_lock,
+ spinlock_t *consumer_lock)
+{
+ struct rxe_queue *new_q;
+ unsigned int num_elem = *num_elem_p;
+ int err;
+ unsigned long flags = 0, flags1;
+
+ new_q = rxe_queue_init(q->rxe, &num_elem, elem_size);
+ if (!new_q)
+ return -ENOMEM;
+
+ err = do_mmap_info(new_q->rxe, udata, false, context, new_q->buf,
+ new_q->buf_size, &new_q->ip);
+ if (err) {
+ vfree(new_q->buf);
+ kfree(new_q);
+ goto err1;
+ }
+
+ spin_lock_irqsave(consumer_lock, flags1);
+
+ if (producer_lock) {
+ spin_lock_irqsave(producer_lock, flags);
+ err = resize_finish(q, new_q, num_elem);
+ spin_unlock_irqrestore(producer_lock, flags);
+ } else {
+ err = resize_finish(q, new_q, num_elem);
+ }
+
+ spin_unlock_irqrestore(consumer_lock, flags1);
+
+ rxe_queue_cleanup(new_q); /* new/old dep on err */
+ if (err)
+ goto err1;
+
+ *num_elem_p = num_elem;
+ return 0;
+
+err1:
+ return err;
+}
+
+void rxe_queue_cleanup(struct rxe_queue *q)
+{
+ if (q->ip)
+ kref_put(&q->ip->ref, rxe_mmap_release);
+ else
+ vfree(q->buf);
+
+ kfree(q);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h
new file mode 100644
index 000000000000..239fd609c31e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_QUEUE_H
+#define RXE_QUEUE_H
+
+/* implements a simple circular buffer that can optionally be
+ * shared between user space and the kernel and can be resized
+
+ * the requested element size is rounded up to a power of 2
+ * and the number of elements in the buffer is also rounded
+ * up to a power of 2. Since the queue is empty when the
+ * producer and consumer indices match the maximum capacity
+ * of the queue is one less than the number of element slots
+ */
+
+/* this data structure is shared between user space and kernel
+ * space for those cases where the queue is shared. It contains
+ * the producer and consumer indices. Is also contains a copy
+ * of the queue size parameters for user space to use but the
+ * kernel must use the parameters in the rxe_queue struct
+ * this MUST MATCH the corresponding librxe struct
+ * for performance reasons arrange to have producer and consumer
+ * pointers in separate cache lines
+ * the kernel should always mask the indices to avoid accessing
+ * memory outside of the data area
+ */
+struct rxe_queue_buf {
+ __u32 log2_elem_size;
+ __u32 index_mask;
+ __u32 pad_1[30];
+ __u32 producer_index;
+ __u32 pad_2[31];
+ __u32 consumer_index;
+ __u32 pad_3[31];
+ __u8 data[0];
+};
+
+struct rxe_queue {
+ struct rxe_dev *rxe;
+ struct rxe_queue_buf *buf;
+ struct rxe_mmap_info *ip;
+ size_t buf_size;
+ size_t elem_size;
+ unsigned int log2_elem_size;
+ unsigned int index_mask;
+};
+
+int do_mmap_info(struct rxe_dev *rxe,
+ struct ib_udata *udata,
+ bool is_req,
+ struct ib_ucontext *context,
+ struct rxe_queue_buf *buf,
+ size_t buf_size,
+ struct rxe_mmap_info **ip_p);
+
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
+ int *num_elem,
+ unsigned int elem_size);
+
+int rxe_queue_resize(struct rxe_queue *q,
+ unsigned int *num_elem_p,
+ unsigned int elem_size,
+ struct ib_ucontext *context,
+ struct ib_udata *udata,
+ /* Protect producers while resizing queue */
+ spinlock_t *producer_lock,
+ /* Protect consumers while resizing queue */
+ spinlock_t *consumer_lock);
+
+void rxe_queue_cleanup(struct rxe_queue *queue);
+
+static inline int next_index(struct rxe_queue *q, int index)
+{
+ return (index + 1) & q->buf->index_mask;
+}
+
+static inline int queue_empty(struct rxe_queue *q)
+{
+ return ((q->buf->producer_index - q->buf->consumer_index)
+ & q->index_mask) == 0;
+}
+
+static inline int queue_full(struct rxe_queue *q)
+{
+ return ((q->buf->producer_index + 1 - q->buf->consumer_index)
+ & q->index_mask) == 0;
+}
+
+static inline void advance_producer(struct rxe_queue *q)
+{
+ q->buf->producer_index = (q->buf->producer_index + 1)
+ & q->index_mask;
+}
+
+static inline void advance_consumer(struct rxe_queue *q)
+{
+ q->buf->consumer_index = (q->buf->consumer_index + 1)
+ & q->index_mask;
+}
+
+static inline void *producer_addr(struct rxe_queue *q)
+{
+ return q->buf->data + ((q->buf->producer_index & q->index_mask)
+ << q->log2_elem_size);
+}
+
+static inline void *consumer_addr(struct rxe_queue *q)
+{
+ return q->buf->data + ((q->buf->consumer_index & q->index_mask)
+ << q->log2_elem_size);
+}
+
+static inline unsigned int producer_index(struct rxe_queue *q)
+{
+ return q->buf->producer_index;
+}
+
+static inline unsigned int consumer_index(struct rxe_queue *q)
+{
+ return q->buf->consumer_index;
+}
+
+static inline void *addr_from_index(struct rxe_queue *q, unsigned int index)
+{
+ return q->buf->data + ((index & q->index_mask)
+ << q->buf->log2_elem_size);
+}
+
+static inline unsigned int index_from_addr(const struct rxe_queue *q,
+ const void *addr)
+{
+ return (((u8 *)addr - q->buf->data) >> q->log2_elem_size)
+ & q->index_mask;
+}
+
+static inline unsigned int queue_count(const struct rxe_queue *q)
+{
+ return (q->buf->producer_index - q->buf->consumer_index)
+ & q->index_mask;
+}
+
+static inline void *queue_head(struct rxe_queue *q)
+{
+ return queue_empty(q) ? NULL : consumer_addr(q);
+}
+
+#endif /* RXE_QUEUE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
new file mode 100644
index 000000000000..144d2f129fcd
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct rxe_qp *qp)
+{
+ if (unlikely(!qp->valid))
+ goto err1;
+
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ if (unlikely((pkt->opcode & IB_OPCODE_RC) != 0)) {
+ pr_warn_ratelimited("bad qp type\n");
+ goto err1;
+ }
+ break;
+ case IB_QPT_UC:
+ if (unlikely(!(pkt->opcode & IB_OPCODE_UC))) {
+ pr_warn_ratelimited("bad qp type\n");
+ goto err1;
+ }
+ break;
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (unlikely(!(pkt->opcode & IB_OPCODE_UD))) {
+ pr_warn_ratelimited("bad qp type\n");
+ goto err1;
+ }
+ break;
+ default:
+ pr_warn_ratelimited("unsupported qp type\n");
+ goto err1;
+ }
+
+ if (pkt->mask & RXE_REQ_MASK) {
+ if (unlikely(qp->resp.state != QP_STATE_READY))
+ goto err1;
+ } else if (unlikely(qp->req.state < QP_STATE_READY ||
+ qp->req.state > QP_STATE_DRAINED)) {
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static void set_bad_pkey_cntr(struct rxe_port *port)
+{
+ spin_lock_bh(&port->port_lock);
+ port->attr.bad_pkey_cntr = min((u32)0xffff,
+ port->attr.bad_pkey_cntr + 1);
+ spin_unlock_bh(&port->port_lock);
+}
+
+static void set_qkey_viol_cntr(struct rxe_port *port)
+{
+ spin_lock_bh(&port->port_lock);
+ port->attr.qkey_viol_cntr = min((u32)0xffff,
+ port->attr.qkey_viol_cntr + 1);
+ spin_unlock_bh(&port->port_lock);
+}
+
+static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ u32 qpn, struct rxe_qp *qp)
+{
+ int i;
+ int found_pkey = 0;
+ struct rxe_port *port = &rxe->port;
+ u16 pkey = bth_pkey(pkt);
+
+ pkt->pkey_index = 0;
+
+ if (qpn == 1) {
+ for (i = 0; i < port->attr.pkey_tbl_len; i++) {
+ if (pkey_match(pkey, port->pkey_tbl[i])) {
+ pkt->pkey_index = i;
+ found_pkey = 1;
+ break;
+ }
+ }
+
+ if (!found_pkey) {
+ pr_warn_ratelimited("bad pkey = 0x%x\n", pkey);
+ set_bad_pkey_cntr(port);
+ goto err1;
+ }
+ } else if (qpn != 0) {
+ if (unlikely(!pkey_match(pkey,
+ port->pkey_tbl[qp->attr.pkey_index]
+ ))) {
+ pr_warn_ratelimited("bad pkey = 0x%0x\n", pkey);
+ set_bad_pkey_cntr(port);
+ goto err1;
+ }
+ pkt->pkey_index = qp->attr.pkey_index;
+ }
+
+ if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) &&
+ qpn != 0 && pkt->mask) {
+ u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey;
+
+ if (unlikely(deth_qkey(pkt) != qkey)) {
+ pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n",
+ deth_qkey(pkt), qkey, qpn);
+ set_qkey_viol_cntr(port);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct rxe_qp *qp)
+{
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+ if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC)
+ goto done;
+
+ if (unlikely(pkt->port_num != qp->attr.port_num)) {
+ pr_warn_ratelimited("port %d != qp port %d\n",
+ pkt->port_num, qp->attr.port_num);
+ goto err1;
+ }
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct in_addr *saddr =
+ &qp->pri_av.sgid_addr._sockaddr_in.sin_addr;
+ struct in_addr *daddr =
+ &qp->pri_av.dgid_addr._sockaddr_in.sin_addr;
+
+ if (ip_hdr(skb)->daddr != saddr->s_addr) {
+ pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n",
+ &ip_hdr(skb)->daddr,
+ &saddr->s_addr);
+ goto err1;
+ }
+
+ if (ip_hdr(skb)->saddr != daddr->s_addr) {
+ pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n",
+ &ip_hdr(skb)->saddr,
+ &daddr->s_addr);
+ goto err1;
+ }
+
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *saddr =
+ &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr;
+ struct in6_addr *daddr =
+ &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr;
+
+ if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr))) {
+ pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n",
+ &ipv6_hdr(skb)->daddr, saddr);
+ goto err1;
+ }
+
+ if (memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr))) {
+ pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ goto err1;
+ }
+ }
+
+done:
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static int hdr_check(struct rxe_pkt_info *pkt)
+{
+ struct rxe_dev *rxe = pkt->rxe;
+ struct rxe_port *port = &rxe->port;
+ struct rxe_qp *qp = NULL;
+ u32 qpn = bth_qpn(pkt);
+ int index;
+ int err;
+
+ if (unlikely(bth_tver(pkt) != BTH_TVER)) {
+ pr_warn_ratelimited("bad tver\n");
+ goto err1;
+ }
+
+ if (qpn != IB_MULTICAST_QPN) {
+ index = (qpn == 0) ? port->qp_smi_index :
+ ((qpn == 1) ? port->qp_gsi_index : qpn);
+ qp = rxe_pool_get_index(&rxe->qp_pool, index);
+ if (unlikely(!qp)) {
+ pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn);
+ goto err1;
+ }
+
+ err = check_type_state(rxe, pkt, qp);
+ if (unlikely(err))
+ goto err2;
+
+ err = check_addr(rxe, pkt, qp);
+ if (unlikely(err))
+ goto err2;
+
+ err = check_keys(rxe, pkt, qpn, qp);
+ if (unlikely(err))
+ goto err2;
+ } else {
+ if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) {
+ pr_warn_ratelimited("no grh for mcast qpn\n");
+ goto err1;
+ }
+ }
+
+ pkt->qp = qp;
+ return 0;
+
+err2:
+ if (qp)
+ rxe_drop_ref(qp);
+err1:
+ return -EINVAL;
+}
+
+static inline void rxe_rcv_pkt(struct rxe_dev *rxe,
+ struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
+{
+ if (pkt->mask & RXE_REQ_MASK)
+ rxe_resp_queue_pkt(rxe, pkt->qp, skb);
+ else
+ rxe_comp_queue_pkt(rxe, pkt->qp, skb);
+}
+
+static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+ struct rxe_mc_grp *mcg;
+ struct sk_buff *skb_copy;
+ struct rxe_mc_elem *mce;
+ struct rxe_qp *qp;
+ union ib_gid dgid;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+ (struct in6_addr *)&dgid);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid));
+
+ /* lookup mcast group corresponding to mgid, takes a ref */
+ mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid);
+ if (!mcg)
+ goto err1; /* mcast group not registered */
+
+ spin_lock_bh(&mcg->mcg_lock);
+
+ list_for_each_entry(mce, &mcg->qp_list, qp_list) {
+ qp = mce->qp;
+ pkt = SKB_TO_PKT(skb);
+
+ /* validate qp for incoming packet */
+ err = check_type_state(rxe, pkt, qp);
+ if (err)
+ continue;
+
+ err = check_keys(rxe, pkt, bth_qpn(pkt), qp);
+ if (err)
+ continue;
+
+ /* if *not* the last qp in the list
+ * make a copy of the skb to post to the next qp
+ */
+ skb_copy = (mce->qp_list.next != &mcg->qp_list) ?
+ skb_clone(skb, GFP_ATOMIC) : NULL;
+
+ pkt->qp = qp;
+ rxe_add_ref(qp);
+ rxe_rcv_pkt(rxe, pkt, skb);
+
+ skb = skb_copy;
+ if (!skb)
+ break;
+ }
+
+ spin_unlock_bh(&mcg->mcg_lock);
+
+ rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key. */
+
+err1:
+ if (skb)
+ kfree_skb(skb);
+}
+
+static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+ union ib_gid dgid;
+ union ib_gid *pdgid;
+ u16 index;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+ (struct in6_addr *)&dgid);
+ pdgid = &dgid;
+ } else {
+ pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr;
+ }
+
+ return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid,
+ IB_GID_TYPE_ROCE_UDP_ENCAP,
+ 1, rxe->ndev, &index);
+}
+
+/* rxe_rcv is called from the interface driver */
+int rxe_rcv(struct sk_buff *skb)
+{
+ int err;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+ struct rxe_dev *rxe = pkt->rxe;
+ __be32 *icrcp;
+ u32 calc_icrc, pack_icrc;
+
+ pkt->offset = 0;
+
+ if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES))
+ goto drop;
+
+ if (unlikely(rxe_match_dgid(rxe, skb) < 0)) {
+ pr_warn_ratelimited("failed matching dgid\n");
+ goto drop;
+ }
+
+ pkt->opcode = bth_opcode(pkt);
+ pkt->psn = bth_psn(pkt);
+ pkt->qp = NULL;
+ pkt->mask |= rxe_opcode[pkt->opcode].mask;
+
+ if (unlikely(skb->len < header_size(pkt)))
+ goto drop;
+
+ err = hdr_check(pkt);
+ if (unlikely(err))
+ goto drop;
+
+ /* Verify ICRC */
+ icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE);
+ pack_icrc = be32_to_cpu(*icrcp);
+
+ calc_icrc = rxe_icrc_hdr(pkt, skb);
+ calc_icrc = crc32_le(calc_icrc, (u8 *)payload_addr(pkt), payload_size(pkt));
+ calc_icrc = cpu_to_be32(~calc_icrc);
+ if (unlikely(calc_icrc != pack_icrc)) {
+ char saddr[sizeof(struct in6_addr)];
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ sprintf(saddr, "%pI6", &ipv6_hdr(skb)->saddr);
+ else if (skb->protocol == htons(ETH_P_IP))
+ sprintf(saddr, "%pI4", &ip_hdr(skb)->saddr);
+ else
+ sprintf(saddr, "unknown");
+
+ pr_warn_ratelimited("bad ICRC from %s\n", saddr);
+ goto drop;
+ }
+
+ if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
+ rxe_rcv_mcast_pkt(rxe, skb);
+ else
+ rxe_rcv_pkt(rxe, pkt, skb);
+
+ return 0;
+
+drop:
+ if (pkt->qp)
+ rxe_drop_ref(pkt->qp);
+
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(rxe_rcv);
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
new file mode 100644
index 000000000000..13a848a518e8
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ unsigned opcode);
+
+static inline void retry_first_write_send(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe,
+ unsigned mask, int npsn)
+{
+ int i;
+
+ for (i = 0; i < npsn; i++) {
+ int to_send = (wqe->dma.resid > qp->mtu) ?
+ qp->mtu : wqe->dma.resid;
+
+ qp->req.opcode = next_opcode(qp, wqe,
+ wqe->wr.opcode);
+
+ if (wqe->wr.send_flags & IB_SEND_INLINE) {
+ wqe->dma.resid -= to_send;
+ wqe->dma.sge_offset += to_send;
+ } else {
+ advance_dma_data(&wqe->dma, to_send);
+ }
+ if (mask & WR_WRITE_MASK)
+ wqe->iova += qp->mtu;
+ }
+}
+
+static void req_retry(struct rxe_qp *qp)
+{
+ struct rxe_send_wqe *wqe;
+ unsigned int wqe_index;
+ unsigned int mask;
+ int npsn;
+ int first = 1;
+
+ wqe = queue_head(qp->sq.queue);
+ npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK;
+
+ qp->req.wqe_index = consumer_index(qp->sq.queue);
+ qp->req.psn = qp->comp.psn;
+ qp->req.opcode = -1;
+
+ for (wqe_index = consumer_index(qp->sq.queue);
+ wqe_index != producer_index(qp->sq.queue);
+ wqe_index = next_index(qp->sq.queue, wqe_index)) {
+ wqe = addr_from_index(qp->sq.queue, wqe_index);
+ mask = wr_opcode_mask(wqe->wr.opcode, qp);
+
+ if (wqe->state == wqe_state_posted)
+ break;
+
+ if (wqe->state == wqe_state_done)
+ continue;
+
+ wqe->iova = (mask & WR_ATOMIC_MASK) ?
+ wqe->wr.wr.atomic.remote_addr :
+ (mask & WR_READ_OR_WRITE_MASK) ?
+ wqe->wr.wr.rdma.remote_addr :
+ 0;
+
+ if (!first || (mask & WR_READ_MASK) == 0) {
+ wqe->dma.resid = wqe->dma.length;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ }
+
+ if (first) {
+ first = 0;
+
+ if (mask & WR_WRITE_OR_SEND_MASK)
+ retry_first_write_send(qp, wqe, mask, npsn);
+
+ if (mask & WR_READ_MASK)
+ wqe->iova += npsn * qp->mtu;
+ }
+
+ wqe->state = wqe_state_posted;
+ }
+}
+
+void rnr_nak_timer(unsigned long data)
+{
+ struct rxe_qp *qp = (struct rxe_qp *)data;
+
+ pr_debug("rnr nak timer fired\n");
+ rxe_run_task(&qp->req.task, 1);
+}
+
+static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
+{
+ struct rxe_send_wqe *wqe = queue_head(qp->sq.queue);
+ unsigned long flags;
+
+ if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+ /* check to see if we are drained;
+ * state_lock used by requester and completer
+ */
+ spin_lock_irqsave(&qp->state_lock, flags);
+ do {
+ if (qp->req.state != QP_STATE_DRAIN) {
+ /* comp just finished */
+ spin_unlock_irqrestore(&qp->state_lock,
+ flags);
+ break;
+ }
+
+ if (wqe && ((qp->req.wqe_index !=
+ consumer_index(qp->sq.queue)) ||
+ (wqe->state != wqe_state_posted))) {
+ /* comp not done yet */
+ spin_unlock_irqrestore(&qp->state_lock,
+ flags);
+ break;
+ }
+
+ qp->req.state = QP_STATE_DRAINED;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_SQ_DRAINED;
+ qp->ibqp.event_handler(&ev,
+ qp->ibqp.qp_context);
+ }
+ } while (0);
+ }
+
+ if (qp->req.wqe_index == producer_index(qp->sq.queue))
+ return NULL;
+
+ wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index);
+
+ if (unlikely((qp->req.state == QP_STATE_DRAIN ||
+ qp->req.state == QP_STATE_DRAINED) &&
+ (wqe->state != wqe_state_processing)))
+ return NULL;
+
+ if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) &&
+ (qp->req.wqe_index != consumer_index(qp->sq.queue)))) {
+ qp->req.wait_fence = 1;
+ return NULL;
+ }
+
+ wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
+ return wqe;
+}
+
+static int next_opcode_rc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+ switch (opcode) {
+ case IB_WR_RDMA_WRITE:
+ if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_LAST :
+ IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_ONLY :
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+ case IB_WR_SEND:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_SEND_LAST :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_SEND_ONLY :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case IB_WR_SEND_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case IB_WR_RDMA_READ:
+ return IB_OPCODE_RC_RDMA_READ_REQUEST;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ return IB_OPCODE_RC_COMPARE_SWAP;
+
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ return IB_OPCODE_RC_FETCH_ADD;
+
+ case IB_WR_SEND_WITH_INV:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
+ IB_OPCODE_RC_SEND_FIRST;
+ case IB_WR_REG_MR:
+ case IB_WR_LOCAL_INV:
+ return opcode;
+ }
+
+ return -EINVAL;
+}
+
+static int next_opcode_uc(struct rxe_qp *qp, unsigned opcode, int fits)
+{
+ switch (opcode) {
+ case IB_WR_RDMA_WRITE:
+ if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_LAST :
+ IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_ONLY :
+ IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+ case IB_WR_SEND:
+ if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_SEND_LAST :
+ IB_OPCODE_UC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_SEND_ONLY :
+ IB_OPCODE_UC_SEND_FIRST;
+
+ case IB_WR_SEND_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_UC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_UC_SEND_FIRST;
+ }
+
+ return -EINVAL;
+}
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ unsigned opcode)
+{
+ int fits = (wqe->dma.resid <= qp->mtu);
+
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ return next_opcode_rc(qp, opcode, fits);
+
+ case IB_QPT_UC:
+ return next_opcode_uc(qp, opcode, fits);
+
+ case IB_QPT_SMI:
+ case IB_QPT_UD:
+ case IB_QPT_GSI:
+ switch (opcode) {
+ case IB_WR_SEND:
+ return IB_OPCODE_UD_SEND_ONLY;
+
+ case IB_WR_SEND_WITH_IMM:
+ return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ int depth;
+
+ if (wqe->has_rd_atomic)
+ return 0;
+
+ qp->req.need_rd_atomic = 1;
+ depth = atomic_dec_return(&qp->req.rd_atomic);
+
+ if (depth >= 0) {
+ qp->req.need_rd_atomic = 0;
+ wqe->has_rd_atomic = 1;
+ return 0;
+ }
+
+ atomic_inc(&qp->req.rd_atomic);
+ return -EAGAIN;
+}
+
+static inline int get_mtu(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_port *port;
+ struct rxe_av *av;
+
+ if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))
+ return qp->mtu;
+
+ av = &wqe->av;
+ port = &rxe->port;
+
+ return port->mtu_cap;
+}
+
+static struct sk_buff *init_req_packet(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe,
+ int opcode, int payload,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_port *port = &rxe->port;
+ struct sk_buff *skb;
+ struct rxe_send_wr *ibwr = &wqe->wr;
+ struct rxe_av *av;
+ int pad = (-payload) & 0x3;
+ int paylen;
+ int solicited;
+ u16 pkey;
+ u32 qp_num;
+ int ack_req;
+
+ /* length from start of bth to end of icrc */
+ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+ /* pkt->hdr, rxe, port_num and mask are initialized in ifc
+ * layer
+ */
+ pkt->opcode = opcode;
+ pkt->qp = qp;
+ pkt->psn = qp->req.psn;
+ pkt->mask = rxe_opcode[opcode].mask;
+ pkt->paylen = paylen;
+ pkt->offset = 0;
+ pkt->wqe = wqe;
+
+ /* init skb */
+ av = rxe_get_av(pkt);
+ skb = rxe->ifc_ops->init_packet(rxe, av, paylen, pkt);
+ if (unlikely(!skb))
+ return NULL;
+
+ /* init bth */
+ solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+ (pkt->mask & RXE_END_MASK) &&
+ ((pkt->mask & (RXE_SEND_MASK)) ||
+ (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
+ (RXE_WRITE_MASK | RXE_IMMDT_MASK));
+
+ pkey = (qp_type(qp) == IB_QPT_GSI) ?
+ port->pkey_tbl[ibwr->wr.ud.pkey_index] :
+ port->pkey_tbl[qp->attr.pkey_index];
+
+ qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
+ qp->attr.dest_qp_num;
+
+ ack_req = ((pkt->mask & RXE_END_MASK) ||
+ (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+ if (ack_req)
+ qp->req.noack_pkts = 0;
+
+ bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num,
+ ack_req, pkt->psn);
+
+ /* init optional headers */
+ if (pkt->mask & RXE_RETH_MASK) {
+ reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
+ reth_set_va(pkt, wqe->iova);
+ reth_set_len(pkt, wqe->dma.length);
+ }
+
+ if (pkt->mask & RXE_IMMDT_MASK)
+ immdt_set_imm(pkt, ibwr->ex.imm_data);
+
+ if (pkt->mask & RXE_IETH_MASK)
+ ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey);
+
+ if (pkt->mask & RXE_ATMETH_MASK) {
+ atmeth_set_va(pkt, wqe->iova);
+ if (opcode == IB_OPCODE_RC_COMPARE_SWAP ||
+ opcode == IB_OPCODE_RD_COMPARE_SWAP) {
+ atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap);
+ atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add);
+ } else {
+ atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add);
+ }
+ atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey);
+ }
+
+ if (pkt->mask & RXE_DETH_MASK) {
+ if (qp->ibqp.qp_num == 1)
+ deth_set_qkey(pkt, GSI_QKEY);
+ else
+ deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey);
+ deth_set_sqp(pkt, qp->ibqp.qp_num);
+ }
+
+ return skb;
+}
+
+static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ struct rxe_pkt_info *pkt, struct sk_buff *skb,
+ int paylen)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ u32 crc = 0;
+ u32 *p;
+ int err;
+
+ err = rxe->ifc_ops->prepare(rxe, pkt, skb, &crc);
+ if (err)
+ return err;
+
+ if (pkt->mask & RXE_WRITE_OR_SEND) {
+ if (wqe->wr.send_flags & IB_SEND_INLINE) {
+ u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+ crc = crc32_le(crc, tmp, paylen);
+
+ memcpy(payload_addr(pkt), tmp, paylen);
+
+ wqe->dma.resid -= paylen;
+ wqe->dma.sge_offset += paylen;
+ } else {
+ err = copy_data(rxe, qp->pd, 0, &wqe->dma,
+ payload_addr(pkt), paylen,
+ from_mem_obj,
+ &crc);
+ if (err)
+ return err;
+ }
+ }
+ p = payload_addr(pkt) + paylen + bth_pad(pkt);
+
+ *p = ~crc;
+
+ return 0;
+}
+
+static void update_wqe_state(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe,
+ struct rxe_pkt_info *pkt)
+{
+ if (pkt->mask & RXE_END_MASK) {
+ if (qp_type(qp) == IB_QPT_RC)
+ wqe->state = wqe_state_pending;
+ } else {
+ wqe->state = wqe_state_processing;
+ }
+}
+
+static void update_wqe_psn(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe,
+ struct rxe_pkt_info *pkt,
+ int payload)
+{
+ /* number of packets left to send including current one */
+ int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+ /* handle zero length packet case */
+ if (num_pkt == 0)
+ num_pkt = 1;
+
+ if (pkt->mask & RXE_START_MASK) {
+ wqe->first_psn = qp->req.psn;
+ wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK;
+ }
+
+ if (pkt->mask & RXE_READ_MASK)
+ qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK;
+ else
+ qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+}
+
+static void save_state(struct rxe_send_wqe *wqe,
+ struct rxe_qp *qp,
+ struct rxe_send_wqe *rollback_wqe,
+ struct rxe_qp *rollback_qp)
+{
+ rollback_wqe->state = wqe->state;
+ rollback_wqe->first_psn = wqe->first_psn;
+ rollback_wqe->last_psn = wqe->last_psn;
+ rollback_qp->req.psn = qp->req.psn;
+}
+
+static void rollback_state(struct rxe_send_wqe *wqe,
+ struct rxe_qp *qp,
+ struct rxe_send_wqe *rollback_wqe,
+ struct rxe_qp *rollback_qp)
+{
+ wqe->state = rollback_wqe->state;
+ wqe->first_psn = rollback_wqe->first_psn;
+ wqe->last_psn = rollback_wqe->last_psn;
+ qp->req.psn = rollback_qp->req.psn;
+}
+
+static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ struct rxe_pkt_info *pkt, int payload)
+{
+ qp->req.opcode = pkt->opcode;
+
+ if (pkt->mask & RXE_END_MASK)
+ qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index);
+
+ qp->need_req_skb = 0;
+
+ if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer))
+ mod_timer(&qp->retrans_timer,
+ jiffies + qp->qp_timeout_jiffies);
+}
+
+int rxe_requester(void *arg)
+{
+ struct rxe_qp *qp = (struct rxe_qp *)arg;
+ struct rxe_pkt_info pkt;
+ struct sk_buff *skb;
+ struct rxe_send_wqe *wqe;
+ unsigned mask;
+ int payload;
+ int mtu;
+ int opcode;
+ int ret;
+ struct rxe_qp rollback_qp;
+ struct rxe_send_wqe rollback_wqe;
+
+next_wqe:
+ if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+ goto exit;
+
+ if (unlikely(qp->req.state == QP_STATE_RESET)) {
+ qp->req.wqe_index = consumer_index(qp->sq.queue);
+ qp->req.opcode = -1;
+ qp->req.need_rd_atomic = 0;
+ qp->req.wait_psn = 0;
+ qp->req.need_retry = 0;
+ goto exit;
+ }
+
+ if (unlikely(qp->req.need_retry)) {
+ req_retry(qp);
+ qp->req.need_retry = 0;
+ }
+
+ wqe = req_next_wqe(qp);
+ if (unlikely(!wqe))
+ goto exit;
+
+ if (wqe->mask & WR_REG_MASK) {
+ if (wqe->wr.opcode == IB_WR_LOCAL_INV) {
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_mem *rmr;
+
+ rmr = rxe_pool_get_index(&rxe->mr_pool,
+ wqe->wr.ex.invalidate_rkey >> 8);
+ if (!rmr) {
+ pr_err("No mr for key %#x\n", wqe->wr.ex.invalidate_rkey);
+ wqe->state = wqe_state_error;
+ wqe->status = IB_WC_MW_BIND_ERR;
+ goto exit;
+ }
+ rmr->state = RXE_MEM_STATE_FREE;
+ wqe->state = wqe_state_done;
+ wqe->status = IB_WC_SUCCESS;
+ } else if (wqe->wr.opcode == IB_WR_REG_MR) {
+ struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr);
+
+ rmr->state = RXE_MEM_STATE_VALID;
+ rmr->access = wqe->wr.wr.reg.access;
+ rmr->lkey = wqe->wr.wr.reg.key;
+ rmr->rkey = wqe->wr.wr.reg.key;
+ wqe->state = wqe_state_done;
+ wqe->status = IB_WC_SUCCESS;
+ } else {
+ goto exit;
+ }
+ qp->req.wqe_index = next_index(qp->sq.queue,
+ qp->req.wqe_index);
+ goto next_wqe;
+ }
+
+ if (unlikely(qp_type(qp) == IB_QPT_RC &&
+ qp->req.psn > (qp->comp.psn + RXE_MAX_UNACKED_PSNS))) {
+ qp->req.wait_psn = 1;
+ goto exit;
+ }
+
+ /* Limit the number of inflight SKBs per QP */
+ if (unlikely(atomic_read(&qp->skb_out) >
+ RXE_INFLIGHT_SKBS_PER_QP_HIGH)) {
+ qp->need_req_skb = 1;
+ goto exit;
+ }
+
+ opcode = next_opcode(qp, wqe, wqe->wr.opcode);
+ if (unlikely(opcode < 0)) {
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ goto exit;
+ }
+
+ mask = rxe_opcode[opcode].mask;
+ if (unlikely(mask & RXE_READ_OR_ATOMIC)) {
+ if (check_init_depth(qp, wqe))
+ goto exit;
+ }
+
+ mtu = get_mtu(qp, wqe);
+ payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0;
+ if (payload > mtu) {
+ if (qp_type(qp) == IB_QPT_UD) {
+ /* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+ * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+ * shall not emit any packets for this message. Further, the CI shall not
+ * generate an error due to this condition.
+ */
+
+ /* fake a successful UD send */
+ wqe->first_psn = qp->req.psn;
+ wqe->last_psn = qp->req.psn;
+ qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+ qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+ qp->req.wqe_index = next_index(qp->sq.queue,
+ qp->req.wqe_index);
+ wqe->state = wqe_state_done;
+ wqe->status = IB_WC_SUCCESS;
+ goto complete;
+ }
+ payload = mtu;
+ }
+
+ skb = init_req_packet(qp, wqe, opcode, payload, &pkt);
+ if (unlikely(!skb)) {
+ pr_err("Failed allocating skb\n");
+ goto err;
+ }
+
+ if (fill_packet(qp, wqe, &pkt, skb, payload)) {
+ pr_debug("Error during fill packet\n");
+ goto err;
+ }
+
+ /*
+ * To prevent a race on wqe access between requester and completer,
+ * wqe members state and psn need to be set before calling
+ * rxe_xmit_packet().
+ * Otherwise, completer might initiate an unjustified retry flow.
+ */
+ save_state(wqe, qp, &rollback_wqe, &rollback_qp);
+ update_wqe_state(qp, wqe, &pkt);
+ update_wqe_psn(qp, wqe, &pkt, payload);
+ ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
+ if (ret) {
+ qp->need_req_skb = 1;
+ kfree_skb(skb);
+
+ rollback_state(wqe, qp, &rollback_wqe, &rollback_qp);
+
+ if (ret == -EAGAIN) {
+ rxe_run_task(&qp->req.task, 1);
+ goto exit;
+ }
+
+ goto err;
+ }
+
+ update_state(qp, wqe, &pkt, payload);
+
+ goto next_wqe;
+
+err:
+ kfree_skb(skb);
+ wqe->status = IB_WC_LOC_PROT_ERR;
+ wqe->state = wqe_state_error;
+
+complete:
+ if (qp_type(qp) != IB_QPT_RC) {
+ while (rxe_completer(qp) == 0)
+ ;
+ }
+
+ return 0;
+
+exit:
+ return -EAGAIN;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
new file mode 100644
index 000000000000..3e0f0f2baace
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -0,0 +1,1381 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+enum resp_states {
+ RESPST_NONE,
+ RESPST_GET_REQ,
+ RESPST_CHK_PSN,
+ RESPST_CHK_OP_SEQ,
+ RESPST_CHK_OP_VALID,
+ RESPST_CHK_RESOURCE,
+ RESPST_CHK_LENGTH,
+ RESPST_CHK_RKEY,
+ RESPST_EXECUTE,
+ RESPST_READ_REPLY,
+ RESPST_COMPLETE,
+ RESPST_ACKNOWLEDGE,
+ RESPST_CLEANUP,
+ RESPST_DUPLICATE_REQUEST,
+ RESPST_ERR_MALFORMED_WQE,
+ RESPST_ERR_UNSUPPORTED_OPCODE,
+ RESPST_ERR_MISALIGNED_ATOMIC,
+ RESPST_ERR_PSN_OUT_OF_SEQ,
+ RESPST_ERR_MISSING_OPCODE_FIRST,
+ RESPST_ERR_MISSING_OPCODE_LAST_C,
+ RESPST_ERR_MISSING_OPCODE_LAST_D1E,
+ RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
+ RESPST_ERR_RNR,
+ RESPST_ERR_RKEY_VIOLATION,
+ RESPST_ERR_LENGTH,
+ RESPST_ERR_CQ_OVERFLOW,
+ RESPST_ERROR,
+ RESPST_RESET,
+ RESPST_DONE,
+ RESPST_EXIT,
+};
+
+static char *resp_state_name[] = {
+ [RESPST_NONE] = "NONE",
+ [RESPST_GET_REQ] = "GET_REQ",
+ [RESPST_CHK_PSN] = "CHK_PSN",
+ [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ",
+ [RESPST_CHK_OP_VALID] = "CHK_OP_VALID",
+ [RESPST_CHK_RESOURCE] = "CHK_RESOURCE",
+ [RESPST_CHK_LENGTH] = "CHK_LENGTH",
+ [RESPST_CHK_RKEY] = "CHK_RKEY",
+ [RESPST_EXECUTE] = "EXECUTE",
+ [RESPST_READ_REPLY] = "READ_REPLY",
+ [RESPST_COMPLETE] = "COMPLETE",
+ [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE",
+ [RESPST_CLEANUP] = "CLEANUP",
+ [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST",
+ [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE",
+ [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE",
+ [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC",
+ [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ",
+ [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST",
+ [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C",
+ [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E",
+ [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ",
+ [RESPST_ERR_RNR] = "ERR_RNR",
+ [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION",
+ [RESPST_ERR_LENGTH] = "ERR_LENGTH",
+ [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW",
+ [RESPST_ERROR] = "ERROR",
+ [RESPST_RESET] = "RESET",
+ [RESPST_DONE] = "DONE",
+ [RESPST_EXIT] = "EXIT",
+};
+
+/* rxe_recv calls here to add a request packet to the input queue */
+void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct sk_buff *skb)
+{
+ int must_sched;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+ skb_queue_tail(&qp->req_pkts, skb);
+
+ must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) ||
+ (skb_queue_len(&qp->req_pkts) > 1);
+
+ rxe_run_task(&qp->resp.task, must_sched);
+}
+
+static inline enum resp_states get_req(struct rxe_qp *qp,
+ struct rxe_pkt_info **pkt_p)
+{
+ struct sk_buff *skb;
+
+ if (qp->resp.state == QP_STATE_ERROR) {
+ skb = skb_dequeue(&qp->req_pkts);
+ if (skb) {
+ /* drain request packet queue */
+ rxe_drop_ref(qp);
+ kfree_skb(skb);
+ return RESPST_GET_REQ;
+ }
+
+ /* go drain recv wr queue */
+ return RESPST_CHK_RESOURCE;
+ }
+
+ skb = skb_peek(&qp->req_pkts);
+ if (!skb)
+ return RESPST_EXIT;
+
+ *pkt_p = SKB_TO_PKT(skb);
+
+ return (qp->resp.res) ? RESPST_READ_REPLY : RESPST_CHK_PSN;
+}
+
+static enum resp_states check_psn(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ int diff = psn_compare(pkt->psn, qp->resp.psn);
+
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ if (diff > 0) {
+ if (qp->resp.sent_psn_nak)
+ return RESPST_CLEANUP;
+
+ qp->resp.sent_psn_nak = 1;
+ return RESPST_ERR_PSN_OUT_OF_SEQ;
+
+ } else if (diff < 0) {
+ return RESPST_DUPLICATE_REQUEST;
+ }
+
+ if (qp->resp.sent_psn_nak)
+ qp->resp.sent_psn_nak = 0;
+
+ break;
+
+ case IB_QPT_UC:
+ if (qp->resp.drop_msg || diff != 0) {
+ if (pkt->mask & RXE_START_MASK) {
+ qp->resp.drop_msg = 0;
+ return RESPST_CHK_OP_SEQ;
+ }
+
+ qp->resp.drop_msg = 1;
+ return RESPST_CLEANUP;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return RESPST_CHK_OP_SEQ;
+}
+
+static enum resp_states check_op_seq(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ switch (qp->resp.opcode) {
+ case IB_OPCODE_RC_SEND_FIRST:
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ case IB_OPCODE_RC_SEND_LAST:
+ case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_C;
+ }
+
+ case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_C;
+ }
+
+ default:
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ case IB_OPCODE_RC_SEND_LAST:
+ case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ return RESPST_ERR_MISSING_OPCODE_FIRST;
+ default:
+ return RESPST_CHK_OP_VALID;
+ }
+ }
+ break;
+
+ case IB_QPT_UC:
+ switch (qp->resp.opcode) {
+ case IB_OPCODE_UC_SEND_FIRST:
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ case IB_OPCODE_UC_SEND_LAST:
+ case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+ }
+
+ case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+ }
+
+ default:
+ switch (pkt->opcode) {
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ case IB_OPCODE_UC_SEND_LAST:
+ case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ qp->resp.drop_msg = 1;
+ return RESPST_CLEANUP;
+ default:
+ return RESPST_CHK_OP_VALID;
+ }
+ }
+ break;
+
+ default:
+ return RESPST_CHK_OP_VALID;
+ }
+}
+
+static enum resp_states check_op_valid(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ if (((pkt->mask & RXE_READ_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
+ ((pkt->mask & RXE_WRITE_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
+ ((pkt->mask & RXE_ATOMIC_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) {
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+ }
+
+ break;
+
+ case IB_QPT_UC:
+ if ((pkt->mask & RXE_WRITE_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) {
+ qp->resp.drop_msg = 1;
+ return RESPST_CLEANUP;
+ }
+
+ break;
+
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ break;
+
+ default:
+ WARN_ON(1);
+ break;
+ }
+
+ return RESPST_CHK_RESOURCE;
+}
+
+static enum resp_states get_srq_wqe(struct rxe_qp *qp)
+{
+ struct rxe_srq *srq = qp->srq;
+ struct rxe_queue *q = srq->rq.queue;
+ struct rxe_recv_wqe *wqe;
+ struct ib_event ev;
+
+ if (srq->error)
+ return RESPST_ERR_RNR;
+
+ spin_lock_bh(&srq->rq.consumer_lock);
+
+ wqe = queue_head(q);
+ if (!wqe) {
+ spin_unlock_bh(&srq->rq.consumer_lock);
+ return RESPST_ERR_RNR;
+ }
+
+ /* note kernel and user space recv wqes have same size */
+ memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe));
+
+ qp->resp.wqe = &qp->resp.srq_wqe.wqe;
+ advance_consumer(q);
+
+ if (srq->limit && srq->ibsrq.event_handler &&
+ (queue_count(q) < srq->limit)) {
+ srq->limit = 0;
+ goto event;
+ }
+
+ spin_unlock_bh(&srq->rq.consumer_lock);
+ return RESPST_CHK_LENGTH;
+
+event:
+ spin_unlock_bh(&srq->rq.consumer_lock);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
+ return RESPST_CHK_LENGTH;
+}
+
+static enum resp_states check_resource(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_srq *srq = qp->srq;
+
+ if (qp->resp.state == QP_STATE_ERROR) {
+ if (qp->resp.wqe) {
+ qp->resp.status = IB_WC_WR_FLUSH_ERR;
+ return RESPST_COMPLETE;
+ } else if (!srq) {
+ qp->resp.wqe = queue_head(qp->rq.queue);
+ if (qp->resp.wqe) {
+ qp->resp.status = IB_WC_WR_FLUSH_ERR;
+ return RESPST_COMPLETE;
+ } else {
+ return RESPST_EXIT;
+ }
+ } else {
+ return RESPST_EXIT;
+ }
+ }
+
+ if (pkt->mask & RXE_READ_OR_ATOMIC) {
+ /* it is the requesters job to not send
+ * too many read/atomic ops, we just
+ * recycle the responder resource queue
+ */
+ if (likely(qp->attr.max_rd_atomic > 0))
+ return RESPST_CHK_LENGTH;
+ else
+ return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ;
+ }
+
+ if (pkt->mask & RXE_RWR_MASK) {
+ if (srq)
+ return get_srq_wqe(qp);
+
+ qp->resp.wqe = queue_head(qp->rq.queue);
+ return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR;
+ }
+
+ return RESPST_CHK_LENGTH;
+}
+
+static enum resp_states check_length(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ return RESPST_CHK_RKEY;
+
+ case IB_QPT_UC:
+ return RESPST_CHK_RKEY;
+
+ default:
+ return RESPST_CHK_RKEY;
+ }
+}
+
+static enum resp_states check_rkey(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_mem *mem;
+ u64 va;
+ u32 rkey;
+ u32 resid;
+ u32 pktlen;
+ int mtu = qp->mtu;
+ enum resp_states state;
+ int access;
+
+ if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) {
+ if (pkt->mask & RXE_RETH_MASK) {
+ qp->resp.va = reth_va(pkt);
+ qp->resp.rkey = reth_rkey(pkt);
+ qp->resp.resid = reth_len(pkt);
+ }
+ access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
+ : IB_ACCESS_REMOTE_WRITE;
+ } else if (pkt->mask & RXE_ATOMIC_MASK) {
+ qp->resp.va = atmeth_va(pkt);
+ qp->resp.rkey = atmeth_rkey(pkt);
+ qp->resp.resid = sizeof(u64);
+ access = IB_ACCESS_REMOTE_ATOMIC;
+ } else {
+ return RESPST_EXECUTE;
+ }
+
+ va = qp->resp.va;
+ rkey = qp->resp.rkey;
+ resid = qp->resp.resid;
+ pktlen = payload_size(pkt);
+
+ mem = lookup_mem(qp->pd, access, rkey, lookup_remote);
+ if (!mem) {
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err1;
+ }
+
+ if (unlikely(mem->state == RXE_MEM_STATE_FREE)) {
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err1;
+ }
+
+ if (mem_check_range(mem, va, resid)) {
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err2;
+ }
+
+ if (pkt->mask & RXE_WRITE_MASK) {
+ if (resid > mtu) {
+ if (pktlen != mtu || bth_pad(pkt)) {
+ state = RESPST_ERR_LENGTH;
+ goto err2;
+ }
+
+ resid = mtu;
+ } else {
+ if (pktlen != resid) {
+ state = RESPST_ERR_LENGTH;
+ goto err2;
+ }
+ if ((bth_pad(pkt) != (0x3 & (-resid)))) {
+ /* This case may not be exactly that
+ * but nothing else fits.
+ */
+ state = RESPST_ERR_LENGTH;
+ goto err2;
+ }
+ }
+ }
+
+ WARN_ON(qp->resp.mr);
+
+ qp->resp.mr = mem;
+ return RESPST_EXECUTE;
+
+err2:
+ rxe_drop_ref(mem);
+err1:
+ return state;
+}
+
+static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr,
+ int data_len)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+ err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma,
+ data_addr, data_len, to_mem_obj, NULL);
+ if (unlikely(err))
+ return (err == -ENOSPC) ? RESPST_ERR_LENGTH
+ : RESPST_ERR_MALFORMED_WQE;
+
+ return RESPST_NONE;
+}
+
+static enum resp_states write_data_in(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ enum resp_states rc = RESPST_NONE;
+ int err;
+ int data_len = payload_size(pkt);
+
+ err = rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt),
+ data_len, to_mem_obj, NULL);
+ if (err) {
+ rc = RESPST_ERR_RKEY_VIOLATION;
+ goto out;
+ }
+
+ qp->resp.va += data_len;
+ qp->resp.resid -= data_len;
+
+out:
+ return rc;
+}
+
+/* Guarantee atomicity of atomic operations at the machine level. */
+static DEFINE_SPINLOCK(atomic_ops_lock);
+
+static enum resp_states process_atomic(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ u64 iova = atmeth_va(pkt);
+ u64 *vaddr;
+ enum resp_states ret;
+ struct rxe_mem *mr = qp->resp.mr;
+
+ if (mr->state != RXE_MEM_STATE_VALID) {
+ ret = RESPST_ERR_RKEY_VIOLATION;
+ goto out;
+ }
+
+ vaddr = iova_to_vaddr(mr, iova, sizeof(u64));
+
+ /* check vaddr is 8 bytes aligned. */
+ if (!vaddr || (uintptr_t)vaddr & 7) {
+ ret = RESPST_ERR_MISALIGNED_ATOMIC;
+ goto out;
+ }
+
+ spin_lock_bh(&atomic_ops_lock);
+
+ qp->resp.atomic_orig = *vaddr;
+
+ if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP ||
+ pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP) {
+ if (*vaddr == atmeth_comp(pkt))
+ *vaddr = atmeth_swap_add(pkt);
+ } else {
+ *vaddr += atmeth_swap_add(pkt);
+ }
+
+ spin_unlock_bh(&atomic_ops_lock);
+
+ ret = RESPST_NONE;
+out:
+ return ret;
+}
+
+static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_pkt_info *ack,
+ int opcode,
+ int payload,
+ u32 psn,
+ u8 syndrome,
+ u32 *crcp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct sk_buff *skb;
+ u32 crc = 0;
+ u32 *p;
+ int paylen;
+ int pad;
+ int err;
+
+ /*
+ * allocate packet
+ */
+ pad = (-payload) & 0x3;
+ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+ skb = rxe->ifc_ops->init_packet(rxe, &qp->pri_av, paylen, ack);
+ if (!skb)
+ return NULL;
+
+ ack->qp = qp;
+ ack->opcode = opcode;
+ ack->mask = rxe_opcode[opcode].mask;
+ ack->offset = pkt->offset;
+ ack->paylen = paylen;
+
+ /* fill in bth using the request packet headers */
+ memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES);
+
+ bth_set_opcode(ack, opcode);
+ bth_set_qpn(ack, qp->attr.dest_qp_num);
+ bth_set_pad(ack, pad);
+ bth_set_se(ack, 0);
+ bth_set_psn(ack, psn);
+ bth_set_ack(ack, 0);
+ ack->psn = psn;
+
+ if (ack->mask & RXE_AETH_MASK) {
+ aeth_set_syn(ack, syndrome);
+ aeth_set_msn(ack, qp->resp.msn);
+ }
+
+ if (ack->mask & RXE_ATMACK_MASK)
+ atmack_set_orig(ack, qp->resp.atomic_orig);
+
+ err = rxe->ifc_ops->prepare(rxe, ack, skb, &crc);
+ if (err) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ if (crcp) {
+ /* CRC computation will be continued by the caller */
+ *crcp = crc;
+ } else {
+ p = payload_addr(ack) + payload + bth_pad(ack);
+ *p = ~crc;
+ }
+
+ return skb;
+}
+
+/* RDMA read response. If res is not NULL, then we have a current RDMA request
+ * being processed or replayed.
+ */
+static enum resp_states read_reply(struct rxe_qp *qp,
+ struct rxe_pkt_info *req_pkt)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_pkt_info ack_pkt;
+ struct sk_buff *skb;
+ int mtu = qp->mtu;
+ enum resp_states state;
+ int payload;
+ int opcode;
+ int err;
+ struct resp_res *res = qp->resp.res;
+ u32 icrc;
+ u32 *p;
+
+ if (!res) {
+ /* This is the first time we process that request. Get a
+ * resource
+ */
+ res = &qp->resp.resources[qp->resp.res_head];
+
+ free_rd_atomic_resource(qp, res);
+ rxe_advance_resp_resource(qp);
+
+ res->type = RXE_READ_MASK;
+
+ res->read.va = qp->resp.va;
+ res->read.va_org = qp->resp.va;
+
+ res->first_psn = req_pkt->psn;
+ res->last_psn = req_pkt->psn +
+ (reth_len(req_pkt) + mtu - 1) /
+ mtu - 1;
+ res->cur_psn = req_pkt->psn;
+
+ res->read.resid = qp->resp.resid;
+ res->read.length = qp->resp.resid;
+ res->read.rkey = qp->resp.rkey;
+
+ /* note res inherits the reference to mr from qp */
+ res->read.mr = qp->resp.mr;
+ qp->resp.mr = NULL;
+
+ qp->resp.res = res;
+ res->state = rdatm_res_state_new;
+ }
+
+ if (res->state == rdatm_res_state_new) {
+ if (res->read.resid <= mtu)
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
+ else
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
+ } else {
+ if (res->read.resid > mtu)
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
+ else
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+ }
+
+ res->state = rdatm_res_state_next;
+
+ payload = min_t(int, res->read.resid, mtu);
+
+ skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload,
+ res->cur_psn, AETH_ACK_UNLIMITED, &icrc);
+ if (!skb)
+ return RESPST_ERR_RNR;
+
+ err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt),
+ payload, from_mem_obj, &icrc);
+ if (err)
+ pr_err("Failed copying memory\n");
+
+ p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt);
+ *p = ~icrc;
+
+ err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+ if (err) {
+ pr_err("Failed sending RDMA reply.\n");
+ kfree_skb(skb);
+ return RESPST_ERR_RNR;
+ }
+
+ res->read.va += payload;
+ res->read.resid -= payload;
+ res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK;
+
+ if (res->read.resid > 0) {
+ state = RESPST_DONE;
+ } else {
+ qp->resp.res = NULL;
+ qp->resp.opcode = -1;
+ qp->resp.psn = res->cur_psn;
+ state = RESPST_CLEANUP;
+ }
+
+ return state;
+}
+
+/* Executes a new request. A retried request never reach that function (send
+ * and writes are discarded, and reads and atomics are retried elsewhere.
+ */
+static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+ enum resp_states err;
+
+ if (pkt->mask & RXE_SEND_MASK) {
+ if (qp_type(qp) == IB_QPT_UD ||
+ qp_type(qp) == IB_QPT_SMI ||
+ qp_type(qp) == IB_QPT_GSI) {
+ union rdma_network_hdr hdr;
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+ memset(&hdr, 0, sizeof(hdr));
+ if (skb->protocol == htons(ETH_P_IP))
+ memcpy(&hdr.roce4grh, ip_hdr(skb), sizeof(hdr.roce4grh));
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ memcpy(&hdr.ibgrh, ipv6_hdr(skb), sizeof(hdr.ibgrh));
+
+ err = send_data_in(qp, &hdr, sizeof(hdr));
+ if (err)
+ return err;
+ }
+ err = send_data_in(qp, payload_addr(pkt), payload_size(pkt));
+ if (err)
+ return err;
+ } else if (pkt->mask & RXE_WRITE_MASK) {
+ err = write_data_in(qp, pkt);
+ if (err)
+ return err;
+ } else if (pkt->mask & RXE_READ_MASK) {
+ /* For RDMA Read we can increment the msn now. See C9-148. */
+ qp->resp.msn++;
+ return RESPST_READ_REPLY;
+ } else if (pkt->mask & RXE_ATOMIC_MASK) {
+ err = process_atomic(qp, pkt);
+ if (err)
+ return err;
+ } else
+ /* Unreachable */
+ WARN_ON(1);
+
+ /* We successfully processed this new request. */
+ qp->resp.msn++;
+
+ /* next expected psn, read handles this separately */
+ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+ qp->resp.opcode = pkt->opcode;
+ qp->resp.status = IB_WC_SUCCESS;
+
+ if (pkt->mask & RXE_COMP_MASK)
+ return RESPST_COMPLETE;
+ else if (qp_type(qp) == IB_QPT_RC)
+ return RESPST_ACKNOWLEDGE;
+ else
+ return RESPST_CLEANUP;
+}
+
+static enum resp_states do_complete(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_cqe cqe;
+ struct ib_wc *wc = &cqe.ibwc;
+ struct ib_uverbs_wc *uwc = &cqe.uibwc;
+ struct rxe_recv_wqe *wqe = qp->resp.wqe;
+
+ if (unlikely(!wqe))
+ return RESPST_CLEANUP;
+
+ memset(&cqe, 0, sizeof(cqe));
+
+ wc->wr_id = wqe->wr_id;
+ wc->status = qp->resp.status;
+ wc->qp = &qp->ibqp;
+
+ /* fields after status are not required for errors */
+ if (wc->status == IB_WC_SUCCESS) {
+ wc->opcode = (pkt->mask & RXE_IMMDT_MASK &&
+ pkt->mask & RXE_WRITE_MASK) ?
+ IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
+ wc->vendor_err = 0;
+ wc->byte_len = wqe->dma.length - wqe->dma.resid;
+
+ /* fields after byte_len are different between kernel and user
+ * space
+ */
+ if (qp->rcq->is_user) {
+ uwc->wc_flags = IB_WC_GRH;
+
+ if (pkt->mask & RXE_IMMDT_MASK) {
+ uwc->wc_flags |= IB_WC_WITH_IMM;
+ uwc->ex.imm_data =
+ (__u32 __force)immdt_imm(pkt);
+ }
+
+ if (pkt->mask & RXE_IETH_MASK) {
+ uwc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ uwc->ex.invalidate_rkey = ieth_rkey(pkt);
+ }
+
+ uwc->qp_num = qp->ibqp.qp_num;
+
+ if (pkt->mask & RXE_DETH_MASK)
+ uwc->src_qp = deth_sqp(pkt);
+
+ uwc->port_num = qp->attr.port_num;
+ } else {
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+ wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE;
+ if (skb->protocol == htons(ETH_P_IP))
+ wc->network_hdr_type = RDMA_NETWORK_IPV4;
+ else
+ wc->network_hdr_type = RDMA_NETWORK_IPV6;
+
+ if (pkt->mask & RXE_IMMDT_MASK) {
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ wc->ex.imm_data = immdt_imm(pkt);
+ }
+
+ if (pkt->mask & RXE_IETH_MASK) {
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_mem *rmr;
+
+ wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ wc->ex.invalidate_rkey = ieth_rkey(pkt);
+
+ rmr = rxe_pool_get_index(&rxe->mr_pool,
+ wc->ex.invalidate_rkey >> 8);
+ if (unlikely(!rmr)) {
+ pr_err("Bad rkey %#x invalidation\n", wc->ex.invalidate_rkey);
+ return RESPST_ERROR;
+ }
+ rmr->state = RXE_MEM_STATE_FREE;
+ }
+
+ wc->qp = &qp->ibqp;
+
+ if (pkt->mask & RXE_DETH_MASK)
+ wc->src_qp = deth_sqp(pkt);
+
+ wc->port_num = qp->attr.port_num;
+ }
+ }
+
+ /* have copy for srq and reference for !srq */
+ if (!qp->srq)
+ advance_consumer(qp->rq.queue);
+
+ qp->resp.wqe = NULL;
+
+ if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1))
+ return RESPST_ERR_CQ_OVERFLOW;
+
+ if (qp->resp.state == QP_STATE_ERROR)
+ return RESPST_CHK_RESOURCE;
+
+ if (!pkt)
+ return RESPST_DONE;
+ else if (qp_type(qp) == IB_QPT_RC)
+ return RESPST_ACKNOWLEDGE;
+ else
+ return RESPST_CLEANUP;
+}
+
+static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+ u8 syndrome, u32 psn)
+{
+ int err = 0;
+ struct rxe_pkt_info ack_pkt;
+ struct sk_buff *skb;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+ skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE,
+ 0, psn, syndrome, NULL);
+ if (!skb) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = rxe_xmit_packet(rxe, qp, &ack_pkt, skb);
+ if (err) {
+ pr_err_ratelimited("Failed sending ack\n");
+ kfree_skb(skb);
+ }
+
+err1:
+ return err;
+}
+
+static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+ u8 syndrome)
+{
+ int rc = 0;
+ struct rxe_pkt_info ack_pkt;
+ struct sk_buff *skb;
+ struct sk_buff *skb_copy;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct resp_res *res;
+
+ skb = prepare_ack_packet(qp, pkt, &ack_pkt,
+ IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn,
+ syndrome, NULL);
+ if (!skb) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ skb_copy = skb_clone(skb, GFP_ATOMIC);
+ if (skb_copy)
+ rxe_add_ref(qp); /* for the new SKB */
+ else {
+ pr_warn("Could not clone atomic response\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ res = &qp->resp.resources[qp->resp.res_head];
+ free_rd_atomic_resource(qp, res);
+ rxe_advance_resp_resource(qp);
+
+ memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(skb->cb));
+
+ res->type = RXE_ATOMIC_MASK;
+ res->atomic.skb = skb;
+ res->first_psn = ack_pkt.psn;
+ res->last_psn = ack_pkt.psn;
+ res->cur_psn = ack_pkt.psn;
+
+ rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy);
+ if (rc) {
+ pr_err_ratelimited("Failed sending ack\n");
+ rxe_drop_ref(qp);
+ kfree_skb(skb_copy);
+ }
+
+out:
+ return rc;
+}
+
+static enum resp_states acknowledge(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ if (qp_type(qp) != IB_QPT_RC)
+ return RESPST_CLEANUP;
+
+ if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
+ send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn);
+ else if (pkt->mask & RXE_ATOMIC_MASK)
+ send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED);
+ else if (bth_ack(pkt))
+ send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn);
+
+ return RESPST_CLEANUP;
+}
+
+static enum resp_states cleanup(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct sk_buff *skb;
+
+ if (pkt) {
+ skb = skb_dequeue(&qp->req_pkts);
+ rxe_drop_ref(qp);
+ kfree_skb(skb);
+ }
+
+ if (qp->resp.mr) {
+ rxe_drop_ref(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ return RESPST_DONE;
+}
+
+static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn)
+{
+ int i;
+
+ for (i = 0; i < qp->attr.max_rd_atomic; i++) {
+ struct resp_res *res = &qp->resp.resources[i];
+
+ if (res->type == 0)
+ continue;
+
+ if (psn_compare(psn, res->first_psn) >= 0 &&
+ psn_compare(psn, res->last_psn) <= 0) {
+ return res;
+ }
+ }
+
+ return NULL;
+}
+
+static enum resp_states duplicate_request(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ enum resp_states rc;
+
+ if (pkt->mask & RXE_SEND_MASK ||
+ pkt->mask & RXE_WRITE_MASK) {
+ /* SEND. Ack again and cleanup. C9-105. */
+ if (bth_ack(pkt))
+ send_ack(qp, pkt, AETH_ACK_UNLIMITED, qp->resp.psn - 1);
+ rc = RESPST_CLEANUP;
+ goto out;
+ } else if (pkt->mask & RXE_READ_MASK) {
+ struct resp_res *res;
+
+ res = find_resource(qp, pkt->psn);
+ if (!res) {
+ /* Resource not found. Class D error. Drop the
+ * request.
+ */
+ rc = RESPST_CLEANUP;
+ goto out;
+ } else {
+ /* Ensure this new request is the same as the previous
+ * one or a subset of it.
+ */
+ u64 iova = reth_va(pkt);
+ u32 resid = reth_len(pkt);
+
+ if (iova < res->read.va_org ||
+ resid > res->read.length ||
+ (iova + resid) > (res->read.va_org +
+ res->read.length)) {
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+
+ if (reth_rkey(pkt) != res->read.rkey) {
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+
+ res->cur_psn = pkt->psn;
+ res->state = (pkt->psn == res->first_psn) ?
+ rdatm_res_state_new :
+ rdatm_res_state_replay;
+
+ /* Reset the resource, except length. */
+ res->read.va_org = iova;
+ res->read.va = iova;
+ res->read.resid = resid;
+
+ /* Replay the RDMA read reply. */
+ qp->resp.res = res;
+ rc = RESPST_READ_REPLY;
+ goto out;
+ }
+ } else {
+ struct resp_res *res;
+
+ /* Find the operation in our list of responder resources. */
+ res = find_resource(qp, pkt->psn);
+ if (res) {
+ struct sk_buff *skb_copy;
+
+ skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC);
+ if (skb_copy) {
+ rxe_add_ref(qp); /* for the new SKB */
+ } else {
+ pr_warn("Couldn't clone atomic resp\n");
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+
+ /* Resend the result. */
+ rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
+ pkt, skb_copy);
+ if (rc) {
+ pr_err("Failed resending result. This flow is not handled - skb ignored\n");
+ kfree_skb(skb_copy);
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+ }
+
+ /* Resource not found. Class D error. Drop the request. */
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+out:
+ return rc;
+}
+
+/* Process a class A or C. Both are treated the same in this implementation. */
+static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome,
+ enum ib_wc_status status)
+{
+ qp->resp.aeth_syndrome = syndrome;
+ qp->resp.status = status;
+
+ /* indicate that we should go through the ERROR state */
+ qp->resp.goto_error = 1;
+}
+
+static enum resp_states do_class_d1e_error(struct rxe_qp *qp)
+{
+ /* UC */
+ if (qp->srq) {
+ /* Class E */
+ qp->resp.drop_msg = 1;
+ if (qp->resp.wqe) {
+ qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+ return RESPST_COMPLETE;
+ } else {
+ return RESPST_CLEANUP;
+ }
+ } else {
+ /* Class D1. This packet may be the start of a
+ * new message and could be valid. The previous
+ * message is invalid and ignored. reset the
+ * recv wr to its original state
+ */
+ if (qp->resp.wqe) {
+ qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length;
+ qp->resp.wqe->dma.cur_sge = 0;
+ qp->resp.wqe->dma.sge_offset = 0;
+ qp->resp.opcode = -1;
+ }
+
+ if (qp->resp.mr) {
+ rxe_drop_ref(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ return RESPST_CLEANUP;
+ }
+}
+
+int rxe_responder(void *arg)
+{
+ struct rxe_qp *qp = (struct rxe_qp *)arg;
+ enum resp_states state;
+ struct rxe_pkt_info *pkt = NULL;
+ int ret = 0;
+
+ qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
+
+ if (!qp->valid) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ switch (qp->resp.state) {
+ case QP_STATE_RESET:
+ state = RESPST_RESET;
+ break;
+
+ default:
+ state = RESPST_GET_REQ;
+ break;
+ }
+
+ while (1) {
+ pr_debug("state = %s\n", resp_state_name[state]);
+ switch (state) {
+ case RESPST_GET_REQ:
+ state = get_req(qp, &pkt);
+ break;
+ case RESPST_CHK_PSN:
+ state = check_psn(qp, pkt);
+ break;
+ case RESPST_CHK_OP_SEQ:
+ state = check_op_seq(qp, pkt);
+ break;
+ case RESPST_CHK_OP_VALID:
+ state = check_op_valid(qp, pkt);
+ break;
+ case RESPST_CHK_RESOURCE:
+ state = check_resource(qp, pkt);
+ break;
+ case RESPST_CHK_LENGTH:
+ state = check_length(qp, pkt);
+ break;
+ case RESPST_CHK_RKEY:
+ state = check_rkey(qp, pkt);
+ break;
+ case RESPST_EXECUTE:
+ state = execute(qp, pkt);
+ break;
+ case RESPST_COMPLETE:
+ state = do_complete(qp, pkt);
+ break;
+ case RESPST_READ_REPLY:
+ state = read_reply(qp, pkt);
+ break;
+ case RESPST_ACKNOWLEDGE:
+ state = acknowledge(qp, pkt);
+ break;
+ case RESPST_CLEANUP:
+ state = cleanup(qp, pkt);
+ break;
+ case RESPST_DUPLICATE_REQUEST:
+ state = duplicate_request(qp, pkt);
+ break;
+ case RESPST_ERR_PSN_OUT_OF_SEQ:
+ /* RC only - Class B. Drop packet. */
+ send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
+ state = RESPST_CLEANUP;
+ break;
+
+ case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ:
+ case RESPST_ERR_MISSING_OPCODE_FIRST:
+ case RESPST_ERR_MISSING_OPCODE_LAST_C:
+ case RESPST_ERR_UNSUPPORTED_OPCODE:
+ case RESPST_ERR_MISALIGNED_ATOMIC:
+ /* RC Only - Class C. */
+ do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+ IB_WC_REM_INV_REQ_ERR);
+ state = RESPST_COMPLETE;
+ break;
+
+ case RESPST_ERR_MISSING_OPCODE_LAST_D1E:
+ state = do_class_d1e_error(qp);
+ break;
+ case RESPST_ERR_RNR:
+ if (qp_type(qp) == IB_QPT_RC) {
+ /* RC - class B */
+ send_ack(qp, pkt, AETH_RNR_NAK |
+ (~AETH_TYPE_MASK &
+ qp->attr.min_rnr_timer),
+ pkt->psn);
+ } else {
+ /* UD/UC - class D */
+ qp->resp.drop_msg = 1;
+ }
+ state = RESPST_CLEANUP;
+ break;
+
+ case RESPST_ERR_RKEY_VIOLATION:
+ if (qp_type(qp) == IB_QPT_RC) {
+ /* Class C */
+ do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR,
+ IB_WC_REM_ACCESS_ERR);
+ state = RESPST_COMPLETE;
+ } else {
+ qp->resp.drop_msg = 1;
+ if (qp->srq) {
+ /* UC/SRQ Class D */
+ qp->resp.status = IB_WC_REM_ACCESS_ERR;
+ state = RESPST_COMPLETE;
+ } else {
+ /* UC/non-SRQ Class E. */
+ state = RESPST_CLEANUP;
+ }
+ }
+ break;
+
+ case RESPST_ERR_LENGTH:
+ if (qp_type(qp) == IB_QPT_RC) {
+ /* Class C */
+ do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+ IB_WC_REM_INV_REQ_ERR);
+ state = RESPST_COMPLETE;
+ } else if (qp->srq) {
+ /* UC/UD - class E */
+ qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+ state = RESPST_COMPLETE;
+ } else {
+ /* UC/UD - class D */
+ qp->resp.drop_msg = 1;
+ state = RESPST_CLEANUP;
+ }
+ break;
+
+ case RESPST_ERR_MALFORMED_WQE:
+ /* All, Class A. */
+ do_class_ac_error(qp, AETH_NAK_REM_OP_ERR,
+ IB_WC_LOC_QP_OP_ERR);
+ state = RESPST_COMPLETE;
+ break;
+
+ case RESPST_ERR_CQ_OVERFLOW:
+ /* All - Class G */
+ state = RESPST_ERROR;
+ break;
+
+ case RESPST_DONE:
+ if (qp->resp.goto_error) {
+ state = RESPST_ERROR;
+ break;
+ }
+
+ goto done;
+
+ case RESPST_EXIT:
+ if (qp->resp.goto_error) {
+ state = RESPST_ERROR;
+ break;
+ }
+
+ goto exit;
+
+ case RESPST_RESET: {
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&qp->req_pkts))) {
+ rxe_drop_ref(qp);
+ kfree_skb(skb);
+ }
+
+ while (!qp->srq && qp->rq.queue &&
+ queue_head(qp->rq.queue))
+ advance_consumer(qp->rq.queue);
+
+ qp->resp.wqe = NULL;
+ goto exit;
+ }
+
+ case RESPST_ERROR:
+ qp->resp.goto_error = 0;
+ pr_warn("qp#%d moved to error state\n", qp_num(qp));
+ rxe_qp_error(qp);
+ goto exit;
+
+ default:
+ WARN_ON(1);
+ }
+ }
+
+exit:
+ ret = -EAGAIN;
+done:
+ return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
new file mode 100644
index 000000000000..2a6e3cd2d4e8
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask)
+{
+ if (srq && srq->error) {
+ pr_warn("srq in error state\n");
+ goto err1;
+ }
+
+ if (mask & IB_SRQ_MAX_WR) {
+ if (attr->max_wr > rxe->attr.max_srq_wr) {
+ pr_warn("max_wr(%d) > max_srq_wr(%d)\n",
+ attr->max_wr, rxe->attr.max_srq_wr);
+ goto err1;
+ }
+
+ if (attr->max_wr <= 0) {
+ pr_warn("max_wr(%d) <= 0\n", attr->max_wr);
+ goto err1;
+ }
+
+ if (srq && srq->limit && (attr->max_wr < srq->limit)) {
+ pr_warn("max_wr (%d) < srq->limit (%d)\n",
+ attr->max_wr, srq->limit);
+ goto err1;
+ }
+
+ if (attr->max_wr < RXE_MIN_SRQ_WR)
+ attr->max_wr = RXE_MIN_SRQ_WR;
+ }
+
+ if (mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit > rxe->attr.max_srq_wr) {
+ pr_warn("srq_limit(%d) > max_srq_wr(%d)\n",
+ attr->srq_limit, rxe->attr.max_srq_wr);
+ goto err1;
+ }
+
+ if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) {
+ pr_warn("srq_limit (%d) > cur limit(%d)\n",
+ attr->srq_limit,
+ srq->rq.queue->buf->index_mask);
+ goto err1;
+ }
+ }
+
+ if (mask == IB_SRQ_INIT_MASK) {
+ if (attr->max_sge > rxe->attr.max_srq_sge) {
+ pr_warn("max_sge(%d) > max_srq_sge(%d)\n",
+ attr->max_sge, rxe->attr.max_srq_sge);
+ goto err1;
+ }
+
+ if (attr->max_sge < RXE_MIN_SRQ_SGE)
+ attr->max_sge = RXE_MIN_SRQ_SGE;
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_init_attr *init,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ int err;
+ int srq_wqe_size;
+ struct rxe_queue *q;
+
+ srq->ibsrq.event_handler = init->event_handler;
+ srq->ibsrq.srq_context = init->srq_context;
+ srq->limit = init->attr.srq_limit;
+ srq->srq_num = srq->pelem.index;
+ srq->rq.max_wr = init->attr.max_wr;
+ srq->rq.max_sge = init->attr.max_sge;
+
+ srq_wqe_size = rcv_wqe_size(srq->rq.max_sge);
+
+ spin_lock_init(&srq->rq.producer_lock);
+ spin_lock_init(&srq->rq.consumer_lock);
+
+ q = rxe_queue_init(rxe, &srq->rq.max_wr,
+ srq_wqe_size);
+ if (!q) {
+ pr_warn("unable to allocate queue for srq\n");
+ return -ENOMEM;
+ }
+
+ srq->rq.queue = q;
+
+ err = do_mmap_info(rxe, udata, false, context, q->buf,
+ q->buf_size, &q->ip);
+ if (err)
+ return err;
+
+ if (udata && udata->outlen >= sizeof(struct mminfo) + sizeof(u32)) {
+ if (copy_to_user(udata->outbuf + sizeof(struct mminfo),
+ &srq->srq_num, sizeof(u32)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_queue *q = srq->rq.queue;
+ struct mminfo mi = { .offset = 1, .size = 0};
+
+ if (mask & IB_SRQ_MAX_WR) {
+ /* Check that we can write the mminfo struct to user space */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ __u64 mi_addr;
+
+ /* Get address of user space mminfo struct */
+ err = ib_copy_from_udata(&mi_addr, udata,
+ sizeof(mi_addr));
+ if (err)
+ goto err1;
+
+ udata->outbuf = (void __user *)(unsigned long)mi_addr;
+ udata->outlen = sizeof(mi);
+
+ if (!access_ok(VERIFY_WRITE,
+ (void __user *)udata->outbuf,
+ udata->outlen)) {
+ err = -EFAULT;
+ goto err1;
+ }
+ }
+
+ err = rxe_queue_resize(q, (unsigned int *)&attr->max_wr,
+ rcv_wqe_size(srq->rq.max_sge),
+ srq->rq.queue->ip ?
+ srq->rq.queue->ip->context :
+ NULL,
+ udata, &srq->rq.producer_lock,
+ &srq->rq.consumer_lock);
+ if (err)
+ goto err2;
+ }
+
+ if (mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+
+ return 0;
+
+err2:
+ rxe_queue_cleanup(q);
+ srq->rq.queue = NULL;
+err1:
+ return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c
new file mode 100644
index 000000000000..cf8e77800046
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_net.h"
+
+/* Copy argument and remove trailing CR. Return the new length. */
+static int sanitize_arg(const char *val, char *intf, int intf_len)
+{
+ int len;
+
+ if (!val)
+ return 0;
+
+ /* Remove newline. */
+ for (len = 0; len < intf_len - 1 && val[len] && val[len] != '\n'; len++)
+ intf[len] = val[len];
+ intf[len] = 0;
+
+ if (len == 0 || (val[len] != 0 && val[len] != '\n'))
+ return 0;
+
+ return len;
+}
+
+static void rxe_set_port_state(struct net_device *ndev)
+{
+ struct rxe_dev *rxe = net_to_rxe(ndev);
+ bool is_up = netif_running(ndev) && netif_carrier_ok(ndev);
+
+ if (!rxe)
+ goto out;
+
+ if (is_up)
+ rxe_port_up(rxe);
+ else
+ rxe_port_down(rxe); /* down for unknown state */
+out:
+ return;
+}
+
+static int rxe_param_set_add(const char *val, const struct kernel_param *kp)
+{
+ int len;
+ int err = 0;
+ char intf[32];
+ struct net_device *ndev = NULL;
+ struct rxe_dev *rxe;
+
+ len = sanitize_arg(val, intf, sizeof(intf));
+ if (!len) {
+ pr_err("rxe: add: invalid interface name\n");
+ err = -EINVAL;
+ goto err;
+ }
+
+ ndev = dev_get_by_name(&init_net, intf);
+ if (!ndev) {
+ pr_err("interface %s not found\n", intf);
+ err = -EINVAL;
+ goto err;
+ }
+
+ if (net_to_rxe(ndev)) {
+ pr_err("rxe: already configured on %s\n", intf);
+ err = -EINVAL;
+ goto err;
+ }
+
+ rxe = rxe_net_add(ndev);
+ if (!rxe) {
+ pr_err("rxe: failed to add %s\n", intf);
+ err = -EINVAL;
+ goto err;
+ }
+
+ rxe_set_port_state(ndev);
+ pr_info("rxe: added %s to %s\n", rxe->ib_dev.name, intf);
+err:
+ if (ndev)
+ dev_put(ndev);
+ return err;
+}
+
+static int rxe_param_set_remove(const char *val, const struct kernel_param *kp)
+{
+ int len;
+ char intf[32];
+ struct rxe_dev *rxe;
+
+ len = sanitize_arg(val, intf, sizeof(intf));
+ if (!len) {
+ pr_err("rxe: add: invalid interface name\n");
+ return -EINVAL;
+ }
+
+ if (strncmp("all", intf, len) == 0) {
+ pr_info("rxe_sys: remove all");
+ rxe_remove_all();
+ return 0;
+ }
+
+ rxe = get_rxe_by_name(intf);
+
+ if (!rxe) {
+ pr_err("rxe: not configured on %s\n", intf);
+ return -EINVAL;
+ }
+
+ list_del(&rxe->list);
+ rxe_remove(rxe);
+
+ return 0;
+}
+
+static const struct kernel_param_ops rxe_add_ops = {
+ .set = rxe_param_set_add,
+};
+
+static const struct kernel_param_ops rxe_remove_ops = {
+ .set = rxe_param_set_remove,
+};
+
+module_param_cb(add, &rxe_add_ops, NULL, 0200);
+MODULE_PARM_DESC(add, "Create RXE device over network interface");
+module_param_cb(remove, &rxe_remove_ops, NULL, 0200);
+MODULE_PARM_DESC(remove, "Remove RXE device over network interface");
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
new file mode 100644
index 000000000000..1e19bf828a6e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/hardirq.h>
+
+#include "rxe_task.h"
+
+int __rxe_do_task(struct rxe_task *task)
+
+{
+ int ret;
+
+ while ((ret = task->func(task->arg)) == 0)
+ ;
+
+ task->ret = ret;
+
+ return ret;
+}
+
+/*
+ * this locking is due to a potential race where
+ * a second caller finds the task already running
+ * but looks just after the last call to func
+ */
+void rxe_do_task(unsigned long data)
+{
+ int cont;
+ int ret;
+ unsigned long flags;
+ struct rxe_task *task = (struct rxe_task *)data;
+
+ spin_lock_irqsave(&task->state_lock, flags);
+ switch (task->state) {
+ case TASK_STATE_START:
+ task->state = TASK_STATE_BUSY;
+ spin_unlock_irqrestore(&task->state_lock, flags);
+ break;
+
+ case TASK_STATE_BUSY:
+ task->state = TASK_STATE_ARMED;
+ /* fall through to */
+ case TASK_STATE_ARMED:
+ spin_unlock_irqrestore(&task->state_lock, flags);
+ return;
+
+ default:
+ spin_unlock_irqrestore(&task->state_lock, flags);
+ pr_warn("bad state = %d in rxe_do_task\n", task->state);
+ return;
+ }
+
+ do {
+ cont = 0;
+ ret = task->func(task->arg);
+
+ spin_lock_irqsave(&task->state_lock, flags);
+ switch (task->state) {
+ case TASK_STATE_BUSY:
+ if (ret)
+ task->state = TASK_STATE_START;
+ else
+ cont = 1;
+ break;
+
+ /* soneone tried to run the task since the last time we called
+ * func, so we will call one more time regardless of the
+ * return value
+ */
+ case TASK_STATE_ARMED:
+ task->state = TASK_STATE_BUSY;
+ cont = 1;
+ break;
+
+ default:
+ pr_warn("bad state = %d in rxe_do_task\n",
+ task->state);
+ }
+ spin_unlock_irqrestore(&task->state_lock, flags);
+ } while (cont);
+
+ task->ret = ret;
+}
+
+int rxe_init_task(void *obj, struct rxe_task *task,
+ void *arg, int (*func)(void *), char *name)
+{
+ task->obj = obj;
+ task->arg = arg;
+ task->func = func;
+ snprintf(task->name, sizeof(task->name), "%s", name);
+
+ tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task);
+
+ task->state = TASK_STATE_START;
+ spin_lock_init(&task->state_lock);
+
+ return 0;
+}
+
+void rxe_cleanup_task(struct rxe_task *task)
+{
+ tasklet_kill(&task->tasklet);
+}
+
+void rxe_run_task(struct rxe_task *task, int sched)
+{
+ if (sched)
+ tasklet_schedule(&task->tasklet);
+ else
+ rxe_do_task((unsigned long)task);
+}
+
+void rxe_disable_task(struct rxe_task *task)
+{
+ tasklet_disable(&task->tasklet);
+}
+
+void rxe_enable_task(struct rxe_task *task)
+{
+ tasklet_enable(&task->tasklet);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
new file mode 100644
index 000000000000..d14aa6daed05
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_task.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_TASK_H
+#define RXE_TASK_H
+
+enum {
+ TASK_STATE_START = 0,
+ TASK_STATE_BUSY = 1,
+ TASK_STATE_ARMED = 2,
+};
+
+/*
+ * data structure to describe a 'task' which is a short
+ * function that returns 0 as long as it needs to be
+ * called again.
+ */
+struct rxe_task {
+ void *obj;
+ struct tasklet_struct tasklet;
+ int state;
+ spinlock_t state_lock; /* spinlock for task state */
+ void *arg;
+ int (*func)(void *arg);
+ int ret;
+ char name[16];
+};
+
+/*
+ * init rxe_task structure
+ * arg => parameter to pass to fcn
+ * fcn => function to call until it returns != 0
+ */
+int rxe_init_task(void *obj, struct rxe_task *task,
+ void *arg, int (*func)(void *), char *name);
+
+/* cleanup task */
+void rxe_cleanup_task(struct rxe_task *task);
+
+/*
+ * raw call to func in loop without any checking
+ * can call when tasklets are disabled
+ */
+int __rxe_do_task(struct rxe_task *task);
+
+/*
+ * common function called by any of the main tasklets
+ * If there is any chance that there is additional
+ * work to do someone must reschedule the task before
+ * leaving
+ */
+void rxe_do_task(unsigned long data);
+
+/* run a task, else schedule it to run as a tasklet, The decision
+ * to run or schedule tasklet is based on the parameter sched.
+ */
+void rxe_run_task(struct rxe_task *task, int sched);
+
+/* keep a task from scheduling */
+void rxe_disable_task(struct rxe_task *task);
+
+/* allow task to run */
+void rxe_enable_task(struct rxe_task *task);
+
+#endif /* RXE_TASK_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
new file mode 100644
index 000000000000..4552be960c6a
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -0,0 +1,1330 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static int rxe_query_device(struct ib_device *dev,
+ struct ib_device_attr *attr,
+ struct ib_udata *uhw)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+
+ if (uhw->inlen || uhw->outlen)
+ return -EINVAL;
+
+ *attr = rxe->attr;
+ return 0;
+}
+
+static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed,
+ u8 *active_width)
+{
+ if (speed <= 1000) {
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_SDR;
+ } else if (speed <= 10000) {
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_FDR10;
+ } else if (speed <= 20000) {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_DDR;
+ } else if (speed <= 30000) {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_QDR;
+ } else if (speed <= 40000) {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_FDR10;
+ } else {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_EDR;
+ }
+}
+
+static int rxe_query_port(struct ib_device *dev,
+ u8 port_num, struct ib_port_attr *attr)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+ struct rxe_port *port;
+ u32 speed;
+
+ if (unlikely(port_num != 1)) {
+ pr_warn("invalid port_number %d\n", port_num);
+ goto err1;
+ }
+
+ port = &rxe->port;
+
+ *attr = port->attr;
+
+ mutex_lock(&rxe->usdev_lock);
+ if (rxe->ndev->ethtool_ops->get_link_ksettings) {
+ struct ethtool_link_ksettings ks;
+
+ rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks);
+ speed = ks.base.speed;
+ } else if (rxe->ndev->ethtool_ops->get_settings) {
+ struct ethtool_cmd cmd;
+
+ rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd);
+ speed = cmd.speed;
+ } else {
+ pr_warn("%s speed is unknown, defaulting to 1000\n", rxe->ndev->name);
+ speed = 1000;
+ }
+ rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, &attr->active_width);
+ mutex_unlock(&rxe->usdev_lock);
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static int rxe_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid)
+{
+ int ret;
+
+ if (index > RXE_PORT_GID_TBL_LEN)
+ return -EINVAL;
+
+ ret = ib_get_cached_gid(device, port_num, index, gid, NULL);
+ if (ret == -EAGAIN) {
+ memcpy(gid, &zgid, sizeof(*gid));
+ return 0;
+ }
+
+ return ret;
+}
+
+static int rxe_add_gid(struct ib_device *device, u8 port_num, unsigned int
+ index, const union ib_gid *gid,
+ const struct ib_gid_attr *attr, void **context)
+{
+ if (index >= RXE_PORT_GID_TBL_LEN)
+ return -EINVAL;
+ return 0;
+}
+
+static int rxe_del_gid(struct ib_device *device, u8 port_num, unsigned int
+ index, void **context)
+{
+ if (index >= RXE_PORT_GID_TBL_LEN)
+ return -EINVAL;
+ return 0;
+}
+
+static struct net_device *rxe_get_netdev(struct ib_device *device,
+ u8 port_num)
+{
+ struct rxe_dev *rxe = to_rdev(device);
+
+ if (rxe->ndev) {
+ dev_hold(rxe->ndev);
+ return rxe->ndev;
+ }
+
+ return NULL;
+}
+
+static int rxe_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ struct rxe_dev *rxe = to_rdev(device);
+ struct rxe_port *port;
+
+ if (unlikely(port_num != 1)) {
+ dev_warn(device->dma_device, "invalid port_num = %d\n",
+ port_num);
+ goto err1;
+ }
+
+ port = &rxe->port;
+
+ if (unlikely(index >= port->attr.pkey_tbl_len)) {
+ dev_warn(device->dma_device, "invalid index = %d\n",
+ index);
+ goto err1;
+ }
+
+ *pkey = port->pkey_tbl[index];
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static int rxe_modify_device(struct ib_device *dev,
+ int mask, struct ib_device_modify *attr)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+
+ if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+ rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid);
+
+ if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ memcpy(rxe->ib_dev.node_desc,
+ attr->node_desc, sizeof(rxe->ib_dev.node_desc));
+ }
+
+ return 0;
+}
+
+static int rxe_modify_port(struct ib_device *dev,
+ u8 port_num, int mask, struct ib_port_modify *attr)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+ struct rxe_port *port;
+
+ if (unlikely(port_num != 1)) {
+ pr_warn("invalid port_num = %d\n", port_num);
+ goto err1;
+ }
+
+ port = &rxe->port;
+
+ port->attr.port_cap_flags |= attr->set_port_cap_mask;
+ port->attr.port_cap_flags &= ~attr->clr_port_cap_mask;
+
+ if (mask & IB_PORT_RESET_QKEY_CNTR)
+ port->attr.qkey_viol_cntr = 0;
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev,
+ u8 port_num)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+
+ return rxe->ifc_ops->link_layer(rxe, port_num);
+}
+
+static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+ struct rxe_ucontext *uc;
+
+ uc = rxe_alloc(&rxe->uc_pool);
+ return uc ? &uc->ibuc : ERR_PTR(-ENOMEM);
+}
+
+static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+{
+ struct rxe_ucontext *uc = to_ruc(ibuc);
+
+ rxe_drop_ref(uc);
+ return 0;
+}
+
+static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
+ struct ib_port_immutable *immutable)
+{
+ int err;
+ struct ib_port_attr attr;
+
+ err = rxe_query_port(dev, port_num, &attr);
+ if (err)
+ return err;
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+ return 0;
+}
+
+static struct ib_pd *rxe_alloc_pd(struct ib_device *dev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(dev);
+ struct rxe_pd *pd;
+
+ pd = rxe_alloc(&rxe->pd_pool);
+ return pd ? &pd->ibpd : ERR_PTR(-ENOMEM);
+}
+
+static int rxe_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct rxe_pd *pd = to_rpd(ibpd);
+
+ rxe_drop_ref(pd);
+ return 0;
+}
+
+static int rxe_init_av(struct rxe_dev *rxe, struct ib_ah_attr *attr,
+ struct rxe_av *av)
+{
+ int err;
+ union ib_gid sgid;
+ struct ib_gid_attr sgid_attr;
+
+ err = ib_get_cached_gid(&rxe->ib_dev, attr->port_num,
+ attr->grh.sgid_index, &sgid,
+ &sgid_attr);
+ if (err) {
+ pr_err("Failed to query sgid. err = %d\n", err);
+ return err;
+ }
+
+ err = rxe_av_from_attr(rxe, attr->port_num, av, attr);
+ if (!err)
+ err = rxe_av_fill_ip_info(rxe, av, attr, &sgid_attr, &sgid);
+
+ if (sgid_attr.ndev)
+ dev_put(sgid_attr.ndev);
+ return err;
+}
+
+static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_ah *ah;
+
+ err = rxe_av_chk_attr(rxe, attr);
+ if (err)
+ goto err1;
+
+ ah = rxe_alloc(&rxe->ah_pool);
+ if (!ah) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ rxe_add_ref(pd);
+ ah->pd = pd;
+
+ err = rxe_init_av(rxe, attr, &ah->av);
+ if (err)
+ goto err2;
+
+ return &ah->ibah;
+
+err2:
+ rxe_drop_ref(pd);
+ rxe_drop_ref(ah);
+err1:
+ return ERR_PTR(err);
+}
+
+static int rxe_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibah->device);
+ struct rxe_ah *ah = to_rah(ibah);
+
+ err = rxe_av_chk_attr(rxe, attr);
+ if (err)
+ return err;
+
+ err = rxe_init_av(rxe, attr, &ah->av);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int rxe_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+ struct rxe_dev *rxe = to_rdev(ibah->device);
+ struct rxe_ah *ah = to_rah(ibah);
+
+ rxe_av_to_attr(rxe, &ah->av, attr);
+ return 0;
+}
+
+static int rxe_destroy_ah(struct ib_ah *ibah)
+{
+ struct rxe_ah *ah = to_rah(ibah);
+
+ rxe_drop_ref(ah->pd);
+ rxe_drop_ref(ah);
+ return 0;
+}
+
+static int post_one_recv(struct rxe_rq *rq, struct ib_recv_wr *ibwr)
+{
+ int err;
+ int i;
+ u32 length;
+ struct rxe_recv_wqe *recv_wqe;
+ int num_sge = ibwr->num_sge;
+
+ if (unlikely(queue_full(rq->queue))) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ if (unlikely(num_sge > rq->max_sge)) {
+ err = -EINVAL;
+ goto err1;
+ }
+
+ length = 0;
+ for (i = 0; i < num_sge; i++)
+ length += ibwr->sg_list[i].length;
+
+ recv_wqe = producer_addr(rq->queue);
+ recv_wqe->wr_id = ibwr->wr_id;
+ recv_wqe->num_sge = num_sge;
+
+ memcpy(recv_wqe->dma.sge, ibwr->sg_list,
+ num_sge * sizeof(struct ib_sge));
+
+ recv_wqe->dma.length = length;
+ recv_wqe->dma.resid = length;
+ recv_wqe->dma.num_sge = num_sge;
+ recv_wqe->dma.cur_sge = 0;
+ recv_wqe->dma.sge_offset = 0;
+
+ /* make sure all changes to the work queue are written before we
+ * update the producer pointer
+ */
+ smp_wmb();
+
+ advance_producer(rq->queue);
+ return 0;
+
+err1:
+ return err;
+}
+
+static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *init,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_srq *srq;
+ struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL;
+
+ err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK);
+ if (err)
+ goto err1;
+
+ srq = rxe_alloc(&rxe->srq_pool);
+ if (!srq) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ rxe_add_index(srq);
+ rxe_add_ref(pd);
+ srq->pd = pd;
+
+ err = rxe_srq_from_init(rxe, srq, init, context, udata);
+ if (err)
+ goto err2;
+
+ return &srq->ibsrq;
+
+err2:
+ rxe_drop_ref(pd);
+ rxe_drop_index(srq);
+ rxe_drop_ref(srq);
+err1:
+ return ERR_PTR(err);
+}
+
+static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask mask,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+ struct rxe_dev *rxe = to_rdev(ibsrq->device);
+
+ err = rxe_srq_chk_attr(rxe, srq, attr, mask);
+ if (err)
+ goto err1;
+
+ err = rxe_srq_from_attr(rxe, srq, attr, mask, udata);
+ if (err)
+ goto err1;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+
+ if (srq->error)
+ return -EINVAL;
+
+ attr->max_wr = srq->rq.queue->buf->index_mask;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+}
+
+static int rxe_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+
+ if (srq->rq.queue)
+ rxe_queue_cleanup(srq->rq.queue);
+
+ rxe_drop_ref(srq->pd);
+ rxe_drop_index(srq);
+ rxe_drop_ref(srq);
+
+ return 0;
+}
+
+static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ int err = 0;
+ unsigned long flags;
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+
+ spin_lock_irqsave(&srq->rq.producer_lock, flags);
+
+ while (wr) {
+ err = post_one_recv(&srq->rq, wr);
+ if (unlikely(err))
+ break;
+ wr = wr->next;
+ }
+
+ spin_unlock_irqrestore(&srq->rq.producer_lock, flags);
+
+ if (err)
+ *bad_wr = wr;
+
+ return err;
+}
+
+static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_qp *qp;
+
+ err = rxe_qp_chk_init(rxe, init);
+ if (err)
+ goto err1;
+
+ qp = rxe_alloc(&rxe->qp_pool);
+ if (!qp) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ if (udata) {
+ if (udata->inlen) {
+ err = -EINVAL;
+ goto err1;
+ }
+ qp->is_user = 1;
+ }
+
+ rxe_add_index(qp);
+
+ err = rxe_qp_from_init(rxe, qp, pd, init, udata, ibpd);
+ if (err)
+ goto err2;
+
+ return &qp->ibqp;
+
+err2:
+ rxe_drop_index(qp);
+ rxe_drop_ref(qp);
+err1:
+ return ERR_PTR(err);
+}
+
+static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int mask, struct ib_udata *udata)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_qp *qp = to_rqp(ibqp);
+
+ err = rxe_qp_chk_attr(rxe, qp, attr, mask);
+ if (err)
+ goto err1;
+
+ err = rxe_qp_from_attr(qp, attr, mask, udata);
+ if (err)
+ goto err1;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int mask, struct ib_qp_init_attr *init)
+{
+ struct rxe_qp *qp = to_rqp(ibqp);
+
+ rxe_qp_to_init(qp, init);
+ rxe_qp_to_attr(qp, attr, mask);
+
+ return 0;
+}
+
+static int rxe_destroy_qp(struct ib_qp *ibqp)
+{
+ struct rxe_qp *qp = to_rqp(ibqp);
+
+ rxe_qp_destroy(qp);
+ rxe_drop_index(qp);
+ rxe_drop_ref(qp);
+ return 0;
+}
+
+static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+ unsigned int mask, unsigned int length)
+{
+ int num_sge = ibwr->num_sge;
+ struct rxe_sq *sq = &qp->sq;
+
+ if (unlikely(num_sge > sq->max_sge))
+ goto err1;
+
+ if (unlikely(mask & WR_ATOMIC_MASK)) {
+ if (length < 8)
+ goto err1;
+
+ if (atomic_wr(ibwr)->remote_addr & 0x7)
+ goto err1;
+ }
+
+ if (unlikely((ibwr->send_flags & IB_SEND_INLINE) &&
+ (length > sq->max_inline)))
+ goto err1;
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr,
+ struct ib_send_wr *ibwr)
+{
+ wr->wr_id = ibwr->wr_id;
+ wr->num_sge = ibwr->num_sge;
+ wr->opcode = ibwr->opcode;
+ wr->send_flags = ibwr->send_flags;
+
+ if (qp_type(qp) == IB_QPT_UD ||
+ qp_type(qp) == IB_QPT_SMI ||
+ qp_type(qp) == IB_QPT_GSI) {
+ wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn;
+ wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey;
+ if (qp_type(qp) == IB_QPT_GSI)
+ wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index;
+ if (wr->opcode == IB_WR_SEND_WITH_IMM)
+ wr->ex.imm_data = ibwr->ex.imm_data;
+ } else {
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ wr->ex.imm_data = ibwr->ex.imm_data;
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr;
+ wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ wr->ex.imm_data = ibwr->ex.imm_data;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+ break;
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ wr->wr.atomic.remote_addr =
+ atomic_wr(ibwr)->remote_addr;
+ wr->wr.atomic.compare_add =
+ atomic_wr(ibwr)->compare_add;
+ wr->wr.atomic.swap = atomic_wr(ibwr)->swap;
+ wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey;
+ break;
+ case IB_WR_LOCAL_INV:
+ wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+ break;
+ case IB_WR_REG_MR:
+ wr->wr.reg.mr = reg_wr(ibwr)->mr;
+ wr->wr.reg.key = reg_wr(ibwr)->key;
+ wr->wr.reg.access = reg_wr(ibwr)->access;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+ unsigned int mask, unsigned int length,
+ struct rxe_send_wqe *wqe)
+{
+ int num_sge = ibwr->num_sge;
+ struct ib_sge *sge;
+ int i;
+ u8 *p;
+
+ init_send_wr(qp, &wqe->wr, ibwr);
+
+ if (qp_type(qp) == IB_QPT_UD ||
+ qp_type(qp) == IB_QPT_SMI ||
+ qp_type(qp) == IB_QPT_GSI)
+ memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av));
+
+ if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) {
+ p = wqe->dma.inline_data;
+
+ sge = ibwr->sg_list;
+ for (i = 0; i < num_sge; i++, sge++) {
+ if (qp->is_user && copy_from_user(p, (__user void *)
+ (uintptr_t)sge->addr, sge->length))
+ return -EFAULT;
+
+ else if (!qp->is_user)
+ memcpy(p, (void *)(uintptr_t)sge->addr,
+ sge->length);
+
+ p += sge->length;
+ }
+ } else if (mask & WR_REG_MASK) {
+ wqe->mask = mask;
+ wqe->state = wqe_state_posted;
+ return 0;
+ } else
+ memcpy(wqe->dma.sge, ibwr->sg_list,
+ num_sge * sizeof(struct ib_sge));
+
+ wqe->iova = (mask & WR_ATOMIC_MASK) ?
+ atomic_wr(ibwr)->remote_addr :
+ rdma_wr(ibwr)->remote_addr;
+ wqe->mask = mask;
+ wqe->dma.length = length;
+ wqe->dma.resid = length;
+ wqe->dma.num_sge = num_sge;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ wqe->state = wqe_state_posted;
+ wqe->ssn = atomic_add_return(1, &qp->ssn);
+
+ return 0;
+}
+
+static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr,
+ unsigned mask, u32 length)
+{
+ int err;
+ struct rxe_sq *sq = &qp->sq;
+ struct rxe_send_wqe *send_wqe;
+ unsigned long flags;
+
+ err = validate_send_wr(qp, ibwr, mask, length);
+ if (err)
+ return err;
+
+ spin_lock_irqsave(&qp->sq.sq_lock, flags);
+
+ if (unlikely(queue_full(sq->queue))) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ send_wqe = producer_addr(sq->queue);
+
+ err = init_send_wqe(qp, ibwr, mask, length, send_wqe);
+ if (unlikely(err))
+ goto err1;
+
+ /*
+ * make sure all changes to the work queue are
+ * written before we update the producer pointer
+ */
+ smp_wmb();
+
+ advance_producer(sq->queue);
+ spin_unlock_irqrestore(&qp->sq.sq_lock, flags);
+
+ return 0;
+
+err1:
+ spin_unlock_irqrestore(&qp->sq.sq_lock, flags);
+ return err;
+}
+
+static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ int err = 0;
+ struct rxe_qp *qp = to_rqp(ibqp);
+ unsigned int mask;
+ unsigned int length = 0;
+ int i;
+ int must_sched;
+
+ if (unlikely(!qp->valid)) {
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ if (unlikely(qp->req.state < QP_STATE_READY)) {
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ while (wr) {
+ mask = wr_opcode_mask(wr->opcode, qp);
+ if (unlikely(!mask)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (unlikely((wr->send_flags & IB_SEND_INLINE) &&
+ !(mask & WR_INLINE_MASK))) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ length = 0;
+ for (i = 0; i < wr->num_sge; i++)
+ length += wr->sg_list[i].length;
+
+ err = post_one_send(qp, wr, mask, length);
+
+ if (err) {
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+
+ /*
+ * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
+ * and the requester call ip_local_out_sk() that takes spin_lock_bh.
+ */
+ must_sched = (qp_type(qp) == IB_QPT_GSI) ||
+ (queue_count(qp->sq.queue) > 1);
+
+ rxe_run_task(&qp->req.task, must_sched);
+
+ return err;
+}
+
+static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ int err = 0;
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct rxe_rq *rq = &qp->rq;
+ unsigned long flags;
+
+ if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) {
+ *bad_wr = wr;
+ err = -EINVAL;
+ goto err1;
+ }
+
+ if (unlikely(qp->srq)) {
+ *bad_wr = wr;
+ err = -EINVAL;
+ goto err1;
+ }
+
+ spin_lock_irqsave(&rq->producer_lock, flags);
+
+ while (wr) {
+ err = post_one_recv(rq, wr);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+
+ spin_unlock_irqrestore(&rq->producer_lock, flags);
+
+err1:
+ return err;
+}
+
+static struct ib_cq *rxe_create_cq(struct ib_device *dev,
+ const struct ib_cq_init_attr *attr,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(dev);
+ struct rxe_cq *cq;
+
+ if (attr->flags)
+ return ERR_PTR(-EINVAL);
+
+ err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector, udata);
+ if (err)
+ goto err1;
+
+ cq = rxe_alloc(&rxe->cq_pool);
+ if (!cq) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector,
+ context, udata);
+ if (err)
+ goto err2;
+
+ return &cq->ibcq;
+
+err2:
+ rxe_drop_ref(cq);
+err1:
+ return ERR_PTR(err);
+}
+
+static int rxe_destroy_cq(struct ib_cq *ibcq)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+
+ rxe_drop_ref(cq);
+ return 0;
+}
+
+static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ int err;
+ struct rxe_cq *cq = to_rcq(ibcq);
+ struct rxe_dev *rxe = to_rdev(ibcq->device);
+
+ err = rxe_cq_chk_attr(rxe, cq, cqe, 0, udata);
+ if (err)
+ goto err1;
+
+ err = rxe_cq_resize_queue(cq, cqe, udata);
+ if (err)
+ goto err1;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ int i;
+ struct rxe_cq *cq = to_rcq(ibcq);
+ struct rxe_cqe *cqe;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->cq_lock, flags);
+ for (i = 0; i < num_entries; i++) {
+ cqe = queue_head(cq->queue);
+ if (!cqe)
+ break;
+
+ memcpy(wc++, &cqe->ibwc, sizeof(*wc));
+ advance_consumer(cq->queue);
+ }
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+ return i;
+}
+
+static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+ int count = queue_count(cq->queue);
+
+ return (count > wc_cnt) ? wc_cnt : count;
+}
+
+static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = flags & IB_CQ_SOLICITED_MASK;
+
+ return 0;
+}
+
+static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
+{
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_mem *mr;
+ int err;
+
+ mr = rxe_alloc(&rxe->mr_pool);
+ if (!mr) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ rxe_add_index(mr);
+
+ rxe_add_ref(pd);
+
+ err = rxe_mem_init_dma(rxe, pd, access, mr);
+ if (err)
+ goto err2;
+
+ return &mr->ibmr;
+
+err2:
+ rxe_drop_ref(pd);
+ rxe_drop_index(mr);
+ rxe_drop_ref(mr);
+err1:
+ return ERR_PTR(err);
+}
+
+static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
+ u64 start,
+ u64 length,
+ u64 iova,
+ int access, struct ib_udata *udata)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_mem *mr;
+
+ mr = rxe_alloc(&rxe->mr_pool);
+ if (!mr) {
+ err = -ENOMEM;
+ goto err2;
+ }
+
+ rxe_add_index(mr);
+
+ rxe_add_ref(pd);
+
+ err = rxe_mem_init_user(rxe, pd, start, length, iova,
+ access, udata, mr);
+ if (err)
+ goto err3;
+
+ return &mr->ibmr;
+
+err3:
+ rxe_drop_ref(pd);
+ rxe_drop_index(mr);
+ rxe_drop_ref(mr);
+err2:
+ return ERR_PTR(err);
+}
+
+static int rxe_dereg_mr(struct ib_mr *ibmr)
+{
+ struct rxe_mem *mr = to_rmr(ibmr);
+
+ mr->state = RXE_MEM_STATE_ZOMBIE;
+ rxe_drop_ref(mr->pd);
+ rxe_drop_index(mr);
+ rxe_drop_ref(mr);
+ return 0;
+}
+
+static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_mem *mr;
+ int err;
+
+ if (mr_type != IB_MR_TYPE_MEM_REG)
+ return ERR_PTR(-EINVAL);
+
+ mr = rxe_alloc(&rxe->mr_pool);
+ if (!mr) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ rxe_add_index(mr);
+
+ rxe_add_ref(pd);
+
+ err = rxe_mem_init_fast(rxe, pd, max_num_sg, mr);
+ if (err)
+ goto err2;
+
+ return &mr->ibmr;
+
+err2:
+ rxe_drop_ref(pd);
+ rxe_drop_index(mr);
+ rxe_drop_ref(mr);
+err1:
+ return ERR_PTR(err);
+}
+
+static int rxe_set_page(struct ib_mr *ibmr, u64 addr)
+{
+ struct rxe_mem *mr = to_rmr(ibmr);
+ struct rxe_map *map;
+ struct rxe_phys_buf *buf;
+
+ if (unlikely(mr->nbuf == mr->num_buf))
+ return -ENOMEM;
+
+ map = mr->map[mr->nbuf / RXE_BUF_PER_MAP];
+ buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP];
+
+ buf->addr = addr;
+ buf->size = ibmr->page_size;
+ mr->nbuf++;
+
+ return 0;
+}
+
+static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset)
+{
+ struct rxe_mem *mr = to_rmr(ibmr);
+ int n;
+
+ mr->nbuf = 0;
+
+ n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page);
+
+ mr->va = ibmr->iova;
+ mr->iova = ibmr->iova;
+ mr->length = ibmr->length;
+ mr->page_shift = ilog2(ibmr->page_size);
+ mr->page_mask = ibmr->page_size - 1;
+ mr->offset = mr->iova & mr->page_mask;
+
+ return n;
+}
+
+static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct rxe_mc_grp *grp;
+
+ /* takes a ref on grp if successful */
+ err = rxe_mcast_get_grp(rxe, mgid, &grp);
+ if (err)
+ return err;
+
+ err = rxe_mcast_add_grp_elem(rxe, qp, grp);
+
+ rxe_drop_ref(grp);
+ return err;
+}
+
+static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_qp *qp = to_rqp(ibqp);
+
+ return rxe_mcast_drop_grp_elem(rxe, qp, mgid);
+}
+
+static ssize_t rxe_show_parent(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct rxe_dev *rxe = container_of(device, struct rxe_dev,
+ ib_dev.dev);
+ char *name;
+
+ name = rxe->ifc_ops->parent_name(rxe, 1);
+ return snprintf(buf, 16, "%s\n", name);
+}
+
+static DEVICE_ATTR(parent, S_IRUGO, rxe_show_parent, NULL);
+
+static struct device_attribute *rxe_dev_attributes[] = {
+ &dev_attr_parent,
+};
+
+int rxe_register_device(struct rxe_dev *rxe)
+{
+ int err;
+ int i;
+ struct ib_device *dev = &rxe->ib_dev;
+
+ strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX);
+ strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
+
+ dev->owner = THIS_MODULE;
+ dev->node_type = RDMA_NODE_IB_CA;
+ dev->phys_port_cnt = 1;
+ dev->num_comp_vectors = RXE_NUM_COMP_VECTORS;
+ dev->dma_device = rxe->ifc_ops->dma_device(rxe);
+ dev->local_dma_lkey = 0;
+ dev->node_guid = rxe->ifc_ops->node_guid(rxe);
+ dev->dma_ops = &rxe_dma_mapping_ops;
+
+ dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
+ dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
+ | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
+ | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
+ | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT)
+ | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD)
+ | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD)
+ | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV)
+ | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP)
+ | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP)
+ | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP)
+ | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP)
+ | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND)
+ | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV)
+ | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)
+ | BIT_ULL(IB_USER_VERBS_CMD_REG_MR)
+ | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR)
+ | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH)
+ | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH)
+ | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH)
+ | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH)
+ | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST)
+ | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST)
+ ;
+
+ dev->query_device = rxe_query_device;
+ dev->modify_device = rxe_modify_device;
+ dev->query_port = rxe_query_port;
+ dev->modify_port = rxe_modify_port;
+ dev->get_link_layer = rxe_get_link_layer;
+ dev->query_gid = rxe_query_gid;
+ dev->get_netdev = rxe_get_netdev;
+ dev->add_gid = rxe_add_gid;
+ dev->del_gid = rxe_del_gid;
+ dev->query_pkey = rxe_query_pkey;
+ dev->alloc_ucontext = rxe_alloc_ucontext;
+ dev->dealloc_ucontext = rxe_dealloc_ucontext;
+ dev->mmap = rxe_mmap;
+ dev->get_port_immutable = rxe_port_immutable;
+ dev->alloc_pd = rxe_alloc_pd;
+ dev->dealloc_pd = rxe_dealloc_pd;
+ dev->create_ah = rxe_create_ah;
+ dev->modify_ah = rxe_modify_ah;
+ dev->query_ah = rxe_query_ah;
+ dev->destroy_ah = rxe_destroy_ah;
+ dev->create_srq = rxe_create_srq;
+ dev->modify_srq = rxe_modify_srq;
+ dev->query_srq = rxe_query_srq;
+ dev->destroy_srq = rxe_destroy_srq;
+ dev->post_srq_recv = rxe_post_srq_recv;
+ dev->create_qp = rxe_create_qp;
+ dev->modify_qp = rxe_modify_qp;
+ dev->query_qp = rxe_query_qp;
+ dev->destroy_qp = rxe_destroy_qp;
+ dev->post_send = rxe_post_send;
+ dev->post_recv = rxe_post_recv;
+ dev->create_cq = rxe_create_cq;
+ dev->destroy_cq = rxe_destroy_cq;
+ dev->resize_cq = rxe_resize_cq;
+ dev->poll_cq = rxe_poll_cq;
+ dev->peek_cq = rxe_peek_cq;
+ dev->req_notify_cq = rxe_req_notify_cq;
+ dev->get_dma_mr = rxe_get_dma_mr;
+ dev->reg_user_mr = rxe_reg_user_mr;
+ dev->dereg_mr = rxe_dereg_mr;
+ dev->alloc_mr = rxe_alloc_mr;
+ dev->map_mr_sg = rxe_map_mr_sg;
+ dev->attach_mcast = rxe_attach_mcast;
+ dev->detach_mcast = rxe_detach_mcast;
+
+ err = ib_register_device(dev, NULL);
+ if (err) {
+ pr_warn("rxe_register_device failed, err = %d\n", err);
+ goto err1;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) {
+ err = device_create_file(&dev->dev, rxe_dev_attributes[i]);
+ if (err) {
+ pr_warn("device_create_file failed, i = %d, err = %d\n",
+ i, err);
+ goto err2;
+ }
+ }
+
+ return 0;
+
+err2:
+ ib_unregister_device(dev);
+err1:
+ return err;
+}
+
+int rxe_unregister_device(struct rxe_dev *rxe)
+{
+ int i;
+ struct ib_device *dev = &rxe->ib_dev;
+
+ for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i)
+ device_remove_file(&dev->dev, rxe_dev_attributes[i]);
+
+ ib_unregister_device(dev);
+
+ return 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
new file mode 100644
index 000000000000..cac1d52a08f0
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RXE_VERBS_H
+#define RXE_VERBS_H
+
+#include <linux/interrupt.h>
+#include <rdma/rdma_user_rxe.h>
+#include "rxe_pool.h"
+#include "rxe_task.h"
+
+static inline int pkey_match(u16 key1, u16 key2)
+{
+ return (((key1 & 0x7fff) != 0) &&
+ ((key1 & 0x7fff) == (key2 & 0x7fff)) &&
+ ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0;
+}
+
+/* Return >0 if psn_a > psn_b
+ * 0 if psn_a == psn_b
+ * <0 if psn_a < psn_b
+ */
+static inline int psn_compare(u32 psn_a, u32 psn_b)
+{
+ s32 diff;
+
+ diff = (psn_a - psn_b) << 8;
+ return diff;
+}
+
+struct rxe_ucontext {
+ struct rxe_pool_entry pelem;
+ struct ib_ucontext ibuc;
+};
+
+struct rxe_pd {
+ struct rxe_pool_entry pelem;
+ struct ib_pd ibpd;
+};
+
+struct rxe_ah {
+ struct rxe_pool_entry pelem;
+ struct ib_ah ibah;
+ struct rxe_pd *pd;
+ struct rxe_av av;
+};
+
+struct rxe_cqe {
+ union {
+ struct ib_wc ibwc;
+ struct ib_uverbs_wc uibwc;
+ };
+};
+
+struct rxe_cq {
+ struct rxe_pool_entry pelem;
+ struct ib_cq ibcq;
+ struct rxe_queue *queue;
+ spinlock_t cq_lock;
+ u8 notify;
+ int is_user;
+ struct tasklet_struct comp_task;
+};
+
+enum wqe_state {
+ wqe_state_posted,
+ wqe_state_processing,
+ wqe_state_pending,
+ wqe_state_done,
+ wqe_state_error,
+};
+
+struct rxe_sq {
+ int max_wr;
+ int max_sge;
+ int max_inline;
+ spinlock_t sq_lock; /* guard queue */
+ struct rxe_queue *queue;
+};
+
+struct rxe_rq {
+ int max_wr;
+ int max_sge;
+ spinlock_t producer_lock; /* guard queue producer */
+ spinlock_t consumer_lock; /* guard queue consumer */
+ struct rxe_queue *queue;
+};
+
+struct rxe_srq {
+ struct rxe_pool_entry pelem;
+ struct ib_srq ibsrq;
+ struct rxe_pd *pd;
+ struct rxe_rq rq;
+ u32 srq_num;
+
+ int limit;
+ int error;
+};
+
+enum rxe_qp_state {
+ QP_STATE_RESET,
+ QP_STATE_INIT,
+ QP_STATE_READY,
+ QP_STATE_DRAIN, /* req only */
+ QP_STATE_DRAINED, /* req only */
+ QP_STATE_ERROR
+};
+
+extern char *rxe_qp_state_name[];
+
+struct rxe_req_info {
+ enum rxe_qp_state state;
+ int wqe_index;
+ u32 psn;
+ int opcode;
+ atomic_t rd_atomic;
+ int wait_fence;
+ int need_rd_atomic;
+ int wait_psn;
+ int need_retry;
+ int noack_pkts;
+ struct rxe_task task;
+};
+
+struct rxe_comp_info {
+ u32 psn;
+ int opcode;
+ int timeout;
+ int timeout_retry;
+ u32 retry_cnt;
+ u32 rnr_retry;
+ struct rxe_task task;
+};
+
+enum rdatm_res_state {
+ rdatm_res_state_next,
+ rdatm_res_state_new,
+ rdatm_res_state_replay,
+};
+
+struct resp_res {
+ int type;
+ u32 first_psn;
+ u32 last_psn;
+ u32 cur_psn;
+ enum rdatm_res_state state;
+
+ union {
+ struct {
+ struct sk_buff *skb;
+ } atomic;
+ struct {
+ struct rxe_mem *mr;
+ u64 va_org;
+ u32 rkey;
+ u32 length;
+ u64 va;
+ u32 resid;
+ } read;
+ };
+};
+
+struct rxe_resp_info {
+ enum rxe_qp_state state;
+ u32 msn;
+ u32 psn;
+ int opcode;
+ int drop_msg;
+ int goto_error;
+ int sent_psn_nak;
+ enum ib_wc_status status;
+ u8 aeth_syndrome;
+
+ /* Receive only */
+ struct rxe_recv_wqe *wqe;
+
+ /* RDMA read / atomic only */
+ u64 va;
+ struct rxe_mem *mr;
+ u32 resid;
+ u32 rkey;
+ u64 atomic_orig;
+
+ /* SRQ only */
+ struct {
+ struct rxe_recv_wqe wqe;
+ struct ib_sge sge[RXE_MAX_SGE];
+ } srq_wqe;
+
+ /* Responder resources. It's a circular list where the oldest
+ * resource is dropped first.
+ */
+ struct resp_res *resources;
+ unsigned int res_head;
+ unsigned int res_tail;
+ struct resp_res *res;
+ struct rxe_task task;
+};
+
+struct rxe_qp {
+ struct rxe_pool_entry pelem;
+ struct ib_qp ibqp;
+ struct ib_qp_attr attr;
+ unsigned int valid;
+ unsigned int mtu;
+ int is_user;
+
+ struct rxe_pd *pd;
+ struct rxe_srq *srq;
+ struct rxe_cq *scq;
+ struct rxe_cq *rcq;
+
+ enum ib_sig_type sq_sig_type;
+
+ struct rxe_sq sq;
+ struct rxe_rq rq;
+
+ struct socket *sk;
+
+ struct rxe_av pri_av;
+ struct rxe_av alt_av;
+
+ /* list of mcast groups qp has joined (for cleanup) */
+ struct list_head grp_list;
+ spinlock_t grp_lock; /* guard grp_list */
+
+ struct sk_buff_head req_pkts;
+ struct sk_buff_head resp_pkts;
+ struct sk_buff_head send_pkts;
+
+ struct rxe_req_info req;
+ struct rxe_comp_info comp;
+ struct rxe_resp_info resp;
+
+ atomic_t ssn;
+ atomic_t skb_out;
+ int need_req_skb;
+
+ /* Timer for retranmitting packet when ACKs have been lost. RC
+ * only. The requester sets it when it is not already
+ * started. The responder resets it whenever an ack is
+ * received.
+ */
+ struct timer_list retrans_timer;
+ u64 qp_timeout_jiffies;
+
+ /* Timer for handling RNR NAKS. */
+ struct timer_list rnr_nak_timer;
+
+ spinlock_t state_lock; /* guard requester and completer */
+};
+
+enum rxe_mem_state {
+ RXE_MEM_STATE_ZOMBIE,
+ RXE_MEM_STATE_INVALID,
+ RXE_MEM_STATE_FREE,
+ RXE_MEM_STATE_VALID,
+};
+
+enum rxe_mem_type {
+ RXE_MEM_TYPE_NONE,
+ RXE_MEM_TYPE_DMA,
+ RXE_MEM_TYPE_MR,
+ RXE_MEM_TYPE_FMR,
+ RXE_MEM_TYPE_MW,
+};
+
+#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf))
+
+struct rxe_phys_buf {
+ u64 addr;
+ u64 size;
+};
+
+struct rxe_map {
+ struct rxe_phys_buf buf[RXE_BUF_PER_MAP];
+};
+
+struct rxe_mem {
+ struct rxe_pool_entry pelem;
+ union {
+ struct ib_mr ibmr;
+ struct ib_mw ibmw;
+ };
+
+ struct rxe_pd *pd;
+ struct ib_umem *umem;
+
+ u32 lkey;
+ u32 rkey;
+
+ enum rxe_mem_state state;
+ enum rxe_mem_type type;
+ u64 va;
+ u64 iova;
+ size_t length;
+ u32 offset;
+ int access;
+
+ int page_shift;
+ int page_mask;
+ int map_shift;
+ int map_mask;
+
+ u32 num_buf;
+ u32 nbuf;
+
+ u32 max_buf;
+ u32 num_map;
+
+ struct rxe_map **map;
+};
+
+struct rxe_mc_grp {
+ struct rxe_pool_entry pelem;
+ spinlock_t mcg_lock; /* guard group */
+ struct rxe_dev *rxe;
+ struct list_head qp_list;
+ union ib_gid mgid;
+ int num_qp;
+ u32 qkey;
+ u16 pkey;
+};
+
+struct rxe_mc_elem {
+ struct rxe_pool_entry pelem;
+ struct list_head qp_list;
+ struct list_head grp_list;
+ struct rxe_qp *qp;
+ struct rxe_mc_grp *grp;
+};
+
+struct rxe_port {
+ struct ib_port_attr attr;
+ u16 *pkey_tbl;
+ __be64 port_guid;
+ __be64 subnet_prefix;
+ spinlock_t port_lock; /* guard port */
+ unsigned int mtu_cap;
+ /* special QPs */
+ u32 qp_smi_index;
+ u32 qp_gsi_index;
+};
+
+/* callbacks from rdma_rxe to network interface layer */
+struct rxe_ifc_ops {
+ void (*release)(struct rxe_dev *rxe);
+ __be64 (*node_guid)(struct rxe_dev *rxe);
+ __be64 (*port_guid)(struct rxe_dev *rxe);
+ struct device *(*dma_device)(struct rxe_dev *rxe);
+ int (*mcast_add)(struct rxe_dev *rxe, union ib_gid *mgid);
+ int (*mcast_delete)(struct rxe_dev *rxe, union ib_gid *mgid);
+ int (*prepare)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb, u32 *crc);
+ int (*send)(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb);
+ int (*loopback)(struct sk_buff *skb);
+ struct sk_buff *(*init_packet)(struct rxe_dev *rxe, struct rxe_av *av,
+ int paylen, struct rxe_pkt_info *pkt);
+ char *(*parent_name)(struct rxe_dev *rxe, unsigned int port_num);
+ enum rdma_link_layer (*link_layer)(struct rxe_dev *rxe,
+ unsigned int port_num);
+};
+
+struct rxe_dev {
+ struct ib_device ib_dev;
+ struct ib_device_attr attr;
+ int max_ucontext;
+ int max_inline_data;
+ struct kref ref_cnt;
+ struct mutex usdev_lock;
+
+ struct rxe_ifc_ops *ifc_ops;
+
+ struct net_device *ndev;
+
+ int xmit_errors;
+
+ struct rxe_pool uc_pool;
+ struct rxe_pool pd_pool;
+ struct rxe_pool ah_pool;
+ struct rxe_pool srq_pool;
+ struct rxe_pool qp_pool;
+ struct rxe_pool cq_pool;
+ struct rxe_pool mr_pool;
+ struct rxe_pool mw_pool;
+ struct rxe_pool mc_grp_pool;
+ struct rxe_pool mc_elem_pool;
+
+ spinlock_t pending_lock; /* guard pending_mmaps */
+ struct list_head pending_mmaps;
+
+ spinlock_t mmap_offset_lock; /* guard mmap_offset */
+ int mmap_offset;
+
+ struct rxe_port port;
+ struct list_head list;
+};
+
+static inline struct rxe_dev *to_rdev(struct ib_device *dev)
+{
+ return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL;
+}
+
+static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc)
+{
+ return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL;
+}
+
+static inline struct rxe_pd *to_rpd(struct ib_pd *pd)
+{
+ return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL;
+}
+
+static inline struct rxe_ah *to_rah(struct ib_ah *ah)
+{
+ return ah ? container_of(ah, struct rxe_ah, ibah) : NULL;
+}
+
+static inline struct rxe_srq *to_rsrq(struct ib_srq *srq)
+{
+ return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL;
+}
+
+static inline struct rxe_qp *to_rqp(struct ib_qp *qp)
+{
+ return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL;
+}
+
+static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
+{
+ return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL;
+}
+
+static inline struct rxe_mem *to_rmr(struct ib_mr *mr)
+{
+ return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL;
+}
+
+static inline struct rxe_mem *to_rmw(struct ib_mw *mw)
+{
+ return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL;
+}
+
+int rxe_register_device(struct rxe_dev *rxe);
+int rxe_unregister_device(struct rxe_dev *rxe);
+
+void rxe_mc_cleanup(void *arg);
+
+#endif /* RXE_VERBS_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index caec8e9c4666..9dbfcc0ab577 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -92,6 +92,9 @@ enum {
IPOIB_FLAG_UMCAST = 10,
IPOIB_STOP_NEIGH_GC = 11,
IPOIB_NEIGH_TBL_FLUSH = 12,
+ IPOIB_FLAG_DEV_ADDR_SET = 13,
+ IPOIB_FLAG_DEV_ADDR_CTRL = 14,
+ IPOIB_FLAG_GOING_DOWN = 15,
IPOIB_MAX_BACKOFF_SECONDS = 16,
@@ -392,6 +395,7 @@ struct ipoib_dev_priv {
struct ipoib_ethtool_st ethtool;
struct timer_list poll_timer;
unsigned max_send_sge;
+ bool sm_fullmember_sendonly_support;
};
struct ipoib_ah {
@@ -474,8 +478,10 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
struct ipoib_ah *address, u32 qpn);
void ipoib_reap_ah(struct work_struct *work);
+struct ipoib_path *__path_find(struct net_device *dev, void *gid);
void ipoib_mark_paths_invalid(struct net_device *dev);
void ipoib_flush_paths(struct net_device *dev);
+int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv);
struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index c8ed53562c9b..4ad297d3de89 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -766,7 +766,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb);
} else {
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
++tx->tx_head;
if (++priv->tx_outstanding == ipoib_sendq_size) {
@@ -1318,6 +1318,8 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
}
}
+#define QPN_AND_OPTIONS_OFFSET 4
+
static void ipoib_cm_tx_start(struct work_struct *work)
{
struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
@@ -1326,6 +1328,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
struct ipoib_neigh *neigh;
struct ipoib_cm_tx *p;
unsigned long flags;
+ struct ipoib_path *path;
int ret;
struct ib_sa_path_rec pathrec;
@@ -1338,7 +1341,19 @@ static void ipoib_cm_tx_start(struct work_struct *work)
p = list_entry(priv->cm.start_list.next, typeof(*p), list);
list_del_init(&p->list);
neigh = p->neigh;
+
qpn = IPOIB_QPN(neigh->daddr);
+ /*
+ * As long as the search is with these 2 locks,
+ * path existence indicates its validity.
+ */
+ path = __path_find(dev, neigh->daddr + QPN_AND_OPTIONS_OFFSET);
+ if (!path) {
+ pr_info("%s ignore not valid path %pI6\n",
+ __func__,
+ neigh->daddr + QPN_AND_OPTIONS_OFFSET);
+ goto free_neigh;
+ }
memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
spin_unlock_irqrestore(&priv->lock, flags);
@@ -1350,6 +1365,7 @@ static void ipoib_cm_tx_start(struct work_struct *work)
spin_lock_irqsave(&priv->lock, flags);
if (ret) {
+free_neigh:
neigh = p->neigh;
if (neigh) {
neigh->cm = NULL;
@@ -1486,6 +1502,10 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr,
{
struct net_device *dev = to_net_dev(d);
int ret;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags))
+ return -EPERM;
if (!rtnl_trylock())
return restart_syscall();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index a53fa5fc0dec..7b6d40ff1acf 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -36,15 +36,34 @@
#include "ipoib.h"
+struct ipoib_stats {
+ char stat_string[ETH_GSTRING_LEN];
+ int stat_offset;
+};
+
+#define IPOIB_NETDEV_STAT(m) { \
+ .stat_string = #m, \
+ .stat_offset = offsetof(struct rtnl_link_stats64, m) }
+
+static const struct ipoib_stats ipoib_gstrings_stats[] = {
+ IPOIB_NETDEV_STAT(rx_packets),
+ IPOIB_NETDEV_STAT(tx_packets),
+ IPOIB_NETDEV_STAT(rx_bytes),
+ IPOIB_NETDEV_STAT(tx_bytes),
+ IPOIB_NETDEV_STAT(tx_errors),
+ IPOIB_NETDEV_STAT(rx_dropped),
+ IPOIB_NETDEV_STAT(tx_dropped)
+};
+
+#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats)
+
static void ipoib_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *drvinfo)
{
struct ipoib_dev_priv *priv = netdev_priv(netdev);
- snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
- "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32),
- (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff,
- (int)priv->ca->attrs.fw_ver & 0xffff);
+ ib_get_device_fw_str(priv->ca, drvinfo->fw_version,
+ sizeof(drvinfo->fw_version));
strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
sizeof(drvinfo->bus_info));
@@ -92,11 +111,57 @@ static int ipoib_set_coalesce(struct net_device *dev,
return 0;
}
+static void ipoib_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats __always_unused *stats,
+ u64 *data)
+{
+ int i;
+ struct net_device_stats *net_stats = &dev->stats;
+ u8 *p = (u8 *)net_stats;
+
+ for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++)
+ data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset);
+
+}
+static void ipoib_get_strings(struct net_device __always_unused *dev,
+ u32 stringset, u8 *data)
+{
+ u8 *p = data;
+ int i;
+
+ switch (stringset) {
+ case ETH_SS_STATS:
+ for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) {
+ memcpy(p, ipoib_gstrings_stats[i].stat_string,
+ ETH_GSTRING_LEN);
+ p += ETH_GSTRING_LEN;
+ }
+ break;
+ case ETH_SS_TEST:
+ default:
+ break;
+ }
+}
+static int ipoib_get_sset_count(struct net_device __always_unused *dev,
+ int sset)
+{
+ switch (sset) {
+ case ETH_SS_STATS:
+ return IPOIB_GLOBAL_STATS_LEN;
+ case ETH_SS_TEST:
+ default:
+ break;
+ }
+ return -EOPNOTSUPP;
+}
static const struct ethtool_ops ipoib_ethtool_ops = {
.get_drvinfo = ipoib_get_drvinfo,
.get_coalesce = ipoib_get_coalesce,
.set_coalesce = ipoib_set_coalesce,
+ .get_strings = ipoib_get_strings,
+ .get_ethtool_stats = ipoib_get_ethtool_stats,
+ .get_sset_count = ipoib_get_sset_count,
};
void ipoib_set_ethtool_ops(struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f0e55e47eb54..be11d5d5b8c1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -51,8 +51,6 @@ MODULE_PARM_DESC(data_debug_level,
"Enable data path debug tracing if > 0");
#endif
-static DEFINE_MUTEX(pkey_mutex);
-
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
struct ib_pd *pd, struct ib_ah_attr *attr)
{
@@ -637,7 +635,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
if (netif_queue_stopped(dev))
netif_wake_queue(dev);
} else {
- dev->trans_start = jiffies;
+ netif_trans_update(dev);
address->last_send = priv->tx_head;
++priv->tx_head;
@@ -999,6 +997,106 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv)
return 0;
}
+/*
+ * returns true if the device address of the ipoib interface has changed and the
+ * new address is a valid one (i.e in the gid table), return false otherwise.
+ */
+static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv)
+{
+ union ib_gid search_gid;
+ union ib_gid gid0;
+ union ib_gid *netdev_gid;
+ int err;
+ u16 index;
+ u8 port;
+ bool ret = false;
+
+ netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4);
+ if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL))
+ return false;
+
+ netif_addr_lock_bh(priv->dev);
+
+ /* The subnet prefix may have changed, update it now so we won't have
+ * to do it later
+ */
+ priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix;
+ netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix;
+ search_gid.global.subnet_prefix = gid0.global.subnet_prefix;
+
+ search_gid.global.interface_id = priv->local_gid.global.interface_id;
+
+ netif_addr_unlock_bh(priv->dev);
+
+ err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB,
+ priv->dev, &port, &index);
+
+ netif_addr_lock_bh(priv->dev);
+
+ if (search_gid.global.interface_id !=
+ priv->local_gid.global.interface_id)
+ /* There was a change while we were looking up the gid, bail
+ * here and let the next work sort this out
+ */
+ goto out;
+
+ /* The next section of code needs some background:
+ * Per IB spec the port GUID can't change if the HCA is powered on.
+ * port GUID is the basis for GID at index 0 which is the basis for
+ * the default device address of a ipoib interface.
+ *
+ * so it seems the flow should be:
+ * if user_changed_dev_addr && gid in gid tbl
+ * set bit dev_addr_set
+ * return true
+ * else
+ * return false
+ *
+ * The issue is that there are devices that don't follow the spec,
+ * they change the port GUID when the HCA is powered, so in order
+ * not to break userspace applications, We need to check if the
+ * user wanted to control the device address and we assume that
+ * if he sets the device address back to be based on GID index 0,
+ * he no longer wishs to control it.
+ *
+ * If the user doesn't control the the device address,
+ * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means
+ * the port GUID has changed and GID at index 0 has changed
+ * so we need to change priv->local_gid and priv->dev->dev_addr
+ * to reflect the new GID.
+ */
+ if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+ if (!err && port == priv->port) {
+ set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
+ if (index == 0)
+ clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL,
+ &priv->flags);
+ else
+ set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags);
+ ret = true;
+ } else {
+ ret = false;
+ }
+ } else {
+ if (!err && port == priv->port) {
+ ret = true;
+ } else {
+ if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) {
+ memcpy(&priv->local_gid, &gid0,
+ sizeof(priv->local_gid));
+ memcpy(priv->dev->dev_addr + 4, &gid0,
+ sizeof(priv->local_gid));
+ ret = true;
+ }
+ }
+ }
+
+out:
+ netif_addr_unlock_bh(priv->dev);
+
+ return ret;
+}
+
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
enum ipoib_flush_level level,
int nesting)
@@ -1020,6 +1118,9 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
level != IPOIB_FLUSH_HEAVY) {
+ /* Make sure the dev_addr is set even if not flushing */
+ if (level == IPOIB_FLUSH_LIGHT)
+ ipoib_dev_addr_changed_valid(priv);
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
return;
}
@@ -1031,7 +1132,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
update_parent_pkey(priv);
else
update_child_pkey(priv);
- }
+ } else if (level == IPOIB_FLUSH_LIGHT)
+ ipoib_dev_addr_changed_valid(priv);
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
return;
}
@@ -1059,8 +1161,17 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
}
if (level == IPOIB_FLUSH_LIGHT) {
+ int oper_up;
ipoib_mark_paths_invalid(dev);
+ /* Set IPoIB operation as down to prevent races between:
+ * the flush flow which leaves MCG and on the fly joins
+ * which can happen during that time. mcast restart task
+ * should deal with join requests we missed.
+ */
+ oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
ipoib_mcast_dev_flush(dev);
+ if (oper_up)
+ set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
ipoib_flush_ah(dev);
}
@@ -1083,7 +1194,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
if (level >= IPOIB_FLUSH_NORMAL)
ipoib_ib_dev_up(dev);
- ipoib_mcast_restart_task(&priv->restart_task);
+ if (ipoib_dev_addr_changed_valid(priv))
+ ipoib_mcast_restart_task(&priv->restart_task);
}
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 80807d6e5c4c..cc1c1b062ea5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -99,6 +99,7 @@ static struct net_device *ipoib_get_net_dev_by_params(
struct ib_device *dev, u8 port, u16 pkey,
const union ib_gid *gid, const struct sockaddr *addr,
void *client_data);
+static int ipoib_set_mac(struct net_device *dev, void *addr);
static struct ib_client ipoib_client = {
.name = "ipoib",
@@ -117,6 +118,8 @@ int ipoib_open(struct net_device *dev)
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+ priv->sm_fullmember_sendonly_support = false;
+
if (ipoib_ib_dev_open(dev)) {
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
return 0;
@@ -482,7 +485,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
return -EINVAL;
}
-static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
+struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct rb_node *n = priv->path_tree.rb_node;
@@ -629,6 +632,77 @@ void ipoib_mark_paths_invalid(struct net_device *dev)
spin_unlock_irq(&priv->lock);
}
+struct classport_info_context {
+ struct ipoib_dev_priv *priv;
+ struct completion done;
+ struct ib_sa_query *sa_query;
+};
+
+static void classport_info_query_cb(int status, struct ib_class_port_info *rec,
+ void *context)
+{
+ struct classport_info_context *cb_ctx = context;
+ struct ipoib_dev_priv *priv;
+
+ WARN_ON(!context);
+
+ priv = cb_ctx->priv;
+
+ if (status || !rec) {
+ pr_debug("device: %s failed query classport_info status: %d\n",
+ priv->dev->name, status);
+ /* keeps the default, will try next mcast_restart */
+ priv->sm_fullmember_sendonly_support = false;
+ goto out;
+ }
+
+ if (ib_get_cpi_capmask2(rec) &
+ IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) {
+ pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n",
+ priv->dev->name);
+ priv->sm_fullmember_sendonly_support = true;
+ } else {
+ pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n",
+ priv->dev->name);
+ priv->sm_fullmember_sendonly_support = false;
+ }
+
+out:
+ complete(&cb_ctx->done);
+}
+
+int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv)
+{
+ struct classport_info_context *callback_context;
+ int ret;
+
+ callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL);
+ if (!callback_context)
+ return -ENOMEM;
+
+ callback_context->priv = priv;
+ init_completion(&callback_context->done);
+
+ ret = ib_sa_classport_info_rec_query(&ipoib_sa_client,
+ priv->ca, priv->port, 3000,
+ GFP_KERNEL,
+ classport_info_query_cb,
+ callback_context,
+ &callback_context->sa_query);
+ if (ret < 0) {
+ pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n",
+ priv->dev->name, ret);
+ kfree(callback_context);
+ return ret;
+ }
+
+ /* waiting for the callback to finish before returnning */
+ wait_for_completion(&callback_context->done);
+ kfree(callback_context);
+
+ return ret;
+}
+
void ipoib_flush_paths(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1036,7 +1110,7 @@ static void ipoib_timeout(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
- jiffies_to_msecs(jiffies - dev->trans_start));
+ jiffies_to_msecs(jiffies - dev_trans_start(dev)));
ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
netif_queue_stopped(dev),
priv->tx_head, priv->tx_tail);
@@ -1132,7 +1206,9 @@ struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
neigh = NULL;
goto out_unlock;
}
- neigh->alive = jiffies;
+
+ if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
+ neigh->alive = jiffies;
goto out_unlock;
}
}
@@ -1649,6 +1725,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = {
.ndo_get_vf_config = ipoib_get_vf_config,
.ndo_get_vf_stats = ipoib_get_vf_stats,
.ndo_set_vf_guid = ipoib_set_vf_guid,
+ .ndo_set_mac_address = ipoib_set_mac,
};
static const struct net_device_ops ipoib_netdev_ops_vf = {
@@ -1771,6 +1848,70 @@ int ipoib_add_umcast_attr(struct net_device *dev)
return device_create_file(&dev->dev, &dev_attr_umcast);
}
+static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
+{
+ struct ipoib_dev_priv *child_priv;
+ struct net_device *netdev = priv->dev;
+
+ netif_addr_lock_bh(netdev);
+
+ memcpy(&priv->local_gid.global.interface_id,
+ &gid->global.interface_id,
+ sizeof(gid->global.interface_id));
+ memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
+ clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
+
+ netif_addr_unlock_bh(netdev);
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ down_read(&priv->vlan_rwsem);
+ list_for_each_entry(child_priv, &priv->child_intfs, list)
+ set_base_guid(child_priv, gid);
+ up_read(&priv->vlan_rwsem);
+ }
+}
+
+static int ipoib_check_lladdr(struct net_device *dev,
+ struct sockaddr_storage *ss)
+{
+ union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
+ int ret = 0;
+
+ netif_addr_lock_bh(dev);
+
+ /* Make sure the QPN, reserved and subnet prefix match the current
+ * lladdr, it also makes sure the lladdr is unicast.
+ */
+ if (memcmp(dev->dev_addr, ss->__data,
+ 4 + sizeof(gid->global.subnet_prefix)) ||
+ gid->global.interface_id == 0)
+ ret = -EINVAL;
+
+ netif_addr_unlock_bh(dev);
+
+ return ret;
+}
+
+static int ipoib_set_mac(struct net_device *dev, void *addr)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct sockaddr_storage *ss = addr;
+ int ret;
+
+ if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
+ return -EBUSY;
+
+ ret = ipoib_check_lladdr(dev, ss);
+ if (ret)
+ return ret;
+
+ set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
+
+ queue_work(ipoib_workqueue, &priv->flush_light);
+
+ return 0;
+}
+
static ssize_t create_child(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1826,8 +1967,7 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
priv->hca_caps = hca->attrs.device_cap_flags;
if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
- priv->dev->hw_features = NETIF_F_SG |
- NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
+ priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
if (priv->hca_caps & IB_DEVICE_UD_TSO)
priv->dev->hw_features |= NETIF_F_TSO;
@@ -1894,6 +2034,7 @@ static struct net_device *ipoib_add_port(const char *format,
goto device_init_failed;
} else
memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+ set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
result = ipoib_dev_init(priv->dev, hca, port);
if (result < 0) {
@@ -2001,6 +2142,9 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data)
ib_unregister_event_handler(&priv->event_handler);
flush_workqueue(ipoib_workqueue);
+ /* mark interface in the middle of destruction */
+ set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags);
+
rtnl_lock();
dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
rtnl_unlock();
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 25889311b1e9..d3394b6add24 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -64,6 +64,9 @@ struct ipoib_mcast_iter {
unsigned int send_only;
};
+/* join state that allows creating mcg with sendonly member request */
+#define SENDONLY_FULLMEMBER_JOIN 8
+
/*
* This should be called with the priv->lock held
*/
@@ -326,12 +329,23 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
carrier_on_task);
struct ib_port_attr attr;
+ int ret;
if (ib_query_port(priv->ca, priv->port, &attr) ||
attr.state != IB_PORT_ACTIVE) {
ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
return;
}
+ /*
+ * Check if can send sendonly MCG's with sendonly-fullmember join state.
+ * It done here after the successfully join to the broadcast group,
+ * because the broadcast group must always be joined first and is always
+ * re-joined if the SM changes substantially.
+ */
+ ret = ipoib_check_sm_sendonly_fullmember_support(priv);
+ if (ret < 0)
+ pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n",
+ priv->dev->name, ret);
/*
* Take rtnl_lock to avoid racing with ipoib_stop() and
@@ -515,22 +529,20 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
rec.hop_limit = priv->broadcast->mcmember.hop_limit;
/*
- * Send-only IB Multicast joins do not work at the core
- * IB layer yet, so we can't use them here. However,
- * we are emulating an Ethernet multicast send, which
- * does not require a multicast subscription and will
- * still send properly. The most appropriate thing to
+ * Send-only IB Multicast joins work at the core IB layer but
+ * require specific SM support.
+ * We can use such joins here only if the current SM supports that feature.
+ * However, if not, we emulate an Ethernet multicast send,
+ * which does not require a multicast subscription and will
+ * still send properly. The most appropriate thing to
* do is to create the group if it doesn't exist as that
* most closely emulates the behavior, from a user space
- * application perspecitive, of Ethernet multicast
- * operation. For now, we do a full join, maybe later
- * when the core IB layers support send only joins we
- * will use them.
+ * application perspective, of Ethernet multicast operation.
*/
-#if 0
- if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
- rec.join_state = 4;
-#endif
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+ priv->sm_fullmember_sendonly_support)
+ /* SM supports sendonly-fullmember, otherwise fallback to full-member */
+ rec.join_state = SENDONLY_FULLMEMBER_JOIN;
}
spin_unlock_irq(&priv->lock);
@@ -570,11 +582,13 @@ void ipoib_mcast_join_task(struct work_struct *work)
return;
}
priv->local_lid = port_attr.lid;
+ netif_addr_lock_bh(dev);
- if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
- ipoib_warn(priv, "ib_query_gid() failed\n");
- else
- memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+ if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+ netif_addr_unlock_bh(dev);
+ return;
+ }
+ netif_addr_unlock_bh(dev);
spin_lock_irq(&priv->lock);
if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index b809c373e40e..c55ecb2c3736 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -135,7 +135,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
.cap = {
.max_send_wr = ipoib_sendq_size,
.max_recv_wr = ipoib_recvq_size,
- .max_send_sge = 1,
+ .max_send_sge = min_t(u32, priv->ca->attrs.max_sge,
+ MAX_SKB_FRAGS + 1),
.max_recv_sge = IPOIB_UD_RX_SG
},
.sq_sig_type = IB_SIGNAL_ALL_WR,
@@ -205,10 +206,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING)
init_attr.create_flags |= IB_QP_CREATE_NETIF_QP;
- if (dev->features & NETIF_F_SG)
- init_attr.cap.max_send_sge =
- min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1);
-
priv->qp = ib_create_qp(priv->pd, &init_attr);
if (IS_ERR(priv->qp)) {
printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
@@ -234,6 +231,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
priv->rx_wr.next = NULL;
priv->rx_wr.sg_list = priv->rx_sge;
+ if (init_attr.cap.max_send_sge > 1)
+ dev->features |= NETIF_F_SG;
+
priv->max_send_sge = init_attr.cap.max_send_sge;
return 0;
@@ -307,5 +307,8 @@ void ipoib_event(struct ib_event_handler *handler,
queue_work(ipoib_workqueue, &priv->flush_normal);
} else if (record->event == IB_EVENT_PKEY_CHANGE) {
queue_work(ipoib_workqueue, &priv->flush_heavy);
+ } else if (record->event == IB_EVENT_GID_CHANGE &&
+ !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) {
+ queue_work(ipoib_workqueue, &priv->flush_light);
}
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index fca1a882de27..a2f9f29c6ab5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -68,6 +68,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
priv->pkey = pkey;
memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+ memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid));
+ set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
priv->dev->broadcast[8] = pkey >> 8;
priv->dev->broadcast[9] = pkey & 0xff;
@@ -129,6 +131,9 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
ppriv = netdev_priv(pdev);
+ if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags))
+ return -EPERM;
+
snprintf(intf_name, sizeof intf_name, "%s.%04x",
ppriv->dev->name, pkey);
priv = ipoib_intf_alloc(intf_name);
@@ -181,6 +186,9 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
ppriv = netdev_priv(pdev);
+ if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags))
+ return -EPERM;
+
if (!rtnl_trylock())
return restart_syscall();
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 9a391cc5b9b3..90be56893414 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -236,7 +236,7 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
page_vec->npages = 0;
page_vec->fake_mr.page_size = SIZE_4K;
plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg,
- mem->size, iser_set_page);
+ mem->size, NULL, iser_set_page);
if (unlikely(plen < mem->size)) {
iser_err("page vec too short to hold this SG\n");
iser_data_buf_dump(mem, device->ib_device);
@@ -446,7 +446,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
- n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K);
+ n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K);
if (unlikely(n != mem->size)) {
iser_err("failed to map sg (%d/%d)\n",
n, mem->size);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 411e4464ca23..cae9bbcc27e7 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -33,7 +33,8 @@
#define ISERT_MAX_CONN 8
#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN)
-#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN \
+ ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN)
#define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \
ISERT_MAX_CONN)
@@ -46,14 +47,6 @@ static LIST_HEAD(device_list);
static struct workqueue_struct *isert_comp_wq;
static struct workqueue_struct *isert_release_wq;
-static void
-isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
-static int
-isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn);
-static void
-isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
-static int
-isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn);
static int
isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd);
static int
@@ -142,9 +135,8 @@ isert_create_qp(struct isert_conn *isert_conn,
attr.recv_cq = comp->cq;
attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1;
attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
+ attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX;
attr.cap.max_send_sge = device->ib_device->attrs.max_sge;
- isert_conn->max_sge = min(device->ib_device->attrs.max_sge,
- device->ib_device->attrs.max_sge_rd);
attr.cap.max_recv_sge = 1;
attr.sq_sig_type = IB_SIGNAL_REQ_WR;
attr.qp_type = IB_QPT_RC;
@@ -270,9 +262,9 @@ isert_alloc_comps(struct isert_device *device)
device->ib_device->num_comp_vectors));
isert_info("Using %d CQs, %s supports %d vectors support "
- "Fast registration %d pi_capable %d\n",
+ "pi_capable %d\n",
device->comps_used, device->ib_device->name,
- device->ib_device->num_comp_vectors, device->use_fastreg,
+ device->ib_device->num_comp_vectors,
device->pi_capable);
device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp),
@@ -313,18 +305,6 @@ isert_create_device_ib_res(struct isert_device *device)
isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge);
isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd);
- /* asign function handlers */
- if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
- ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
- device->use_fastreg = 1;
- device->reg_rdma_mem = isert_reg_rdma;
- device->unreg_rdma_mem = isert_unreg_rdma;
- } else {
- device->use_fastreg = 0;
- device->reg_rdma_mem = isert_map_rdma;
- device->unreg_rdma_mem = isert_unmap_cmd;
- }
-
ret = isert_alloc_comps(device);
if (ret)
goto out;
@@ -417,156 +397,15 @@ isert_device_get(struct rdma_cm_id *cma_id)
}
static void
-isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
-{
- struct fast_reg_descriptor *fr_desc, *tmp;
- int i = 0;
-
- if (list_empty(&isert_conn->fr_pool))
- return;
-
- isert_info("Freeing conn %p fastreg pool", isert_conn);
-
- list_for_each_entry_safe(fr_desc, tmp,
- &isert_conn->fr_pool, list) {
- list_del(&fr_desc->list);
- ib_dereg_mr(fr_desc->data_mr);
- if (fr_desc->pi_ctx) {
- ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
- ib_dereg_mr(fr_desc->pi_ctx->sig_mr);
- kfree(fr_desc->pi_ctx);
- }
- kfree(fr_desc);
- ++i;
- }
-
- if (i < isert_conn->fr_pool_size)
- isert_warn("Pool still has %d regions registered\n",
- isert_conn->fr_pool_size - i);
-}
-
-static int
-isert_create_pi_ctx(struct fast_reg_descriptor *desc,
- struct ib_device *device,
- struct ib_pd *pd)
-{
- struct pi_context *pi_ctx;
- int ret;
-
- pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
- if (!pi_ctx) {
- isert_err("Failed to allocate pi context\n");
- return -ENOMEM;
- }
-
- pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
- ISCSI_ISER_SG_TABLESIZE);
- if (IS_ERR(pi_ctx->prot_mr)) {
- isert_err("Failed to allocate prot frmr err=%ld\n",
- PTR_ERR(pi_ctx->prot_mr));
- ret = PTR_ERR(pi_ctx->prot_mr);
- goto err_pi_ctx;
- }
- desc->ind |= ISERT_PROT_KEY_VALID;
-
- pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
- if (IS_ERR(pi_ctx->sig_mr)) {
- isert_err("Failed to allocate signature enabled mr err=%ld\n",
- PTR_ERR(pi_ctx->sig_mr));
- ret = PTR_ERR(pi_ctx->sig_mr);
- goto err_prot_mr;
- }
-
- desc->pi_ctx = pi_ctx;
- desc->ind |= ISERT_SIG_KEY_VALID;
- desc->ind &= ~ISERT_PROTECTED;
-
- return 0;
-
-err_prot_mr:
- ib_dereg_mr(pi_ctx->prot_mr);
-err_pi_ctx:
- kfree(pi_ctx);
-
- return ret;
-}
-
-static int
-isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
- struct fast_reg_descriptor *fr_desc)
-{
- fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
- ISCSI_ISER_SG_TABLESIZE);
- if (IS_ERR(fr_desc->data_mr)) {
- isert_err("Failed to allocate data frmr err=%ld\n",
- PTR_ERR(fr_desc->data_mr));
- return PTR_ERR(fr_desc->data_mr);
- }
- fr_desc->ind |= ISERT_DATA_KEY_VALID;
-
- isert_dbg("Created fr_desc %p\n", fr_desc);
-
- return 0;
-}
-
-static int
-isert_conn_create_fastreg_pool(struct isert_conn *isert_conn)
-{
- struct fast_reg_descriptor *fr_desc;
- struct isert_device *device = isert_conn->device;
- struct se_session *se_sess = isert_conn->conn->sess->se_sess;
- struct se_node_acl *se_nacl = se_sess->se_node_acl;
- int i, ret, tag_num;
- /*
- * Setup the number of FRMRs based upon the number of tags
- * available to session in iscsi_target_locate_portal().
- */
- tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth);
- tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;
-
- isert_conn->fr_pool_size = 0;
- for (i = 0; i < tag_num; i++) {
- fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL);
- if (!fr_desc) {
- isert_err("Failed to allocate fast_reg descriptor\n");
- ret = -ENOMEM;
- goto err;
- }
-
- ret = isert_create_fr_desc(device->ib_device,
- device->pd, fr_desc);
- if (ret) {
- isert_err("Failed to create fastreg descriptor err=%d\n",
- ret);
- kfree(fr_desc);
- goto err;
- }
-
- list_add_tail(&fr_desc->list, &isert_conn->fr_pool);
- isert_conn->fr_pool_size++;
- }
-
- isert_dbg("Creating conn %p fastreg pool size=%d",
- isert_conn, isert_conn->fr_pool_size);
-
- return 0;
-
-err:
- isert_conn_free_fastreg_pool(isert_conn);
- return ret;
-}
-
-static void
isert_init_conn(struct isert_conn *isert_conn)
{
isert_conn->state = ISER_CONN_INIT;
INIT_LIST_HEAD(&isert_conn->node);
init_completion(&isert_conn->login_comp);
init_completion(&isert_conn->login_req_comp);
+ init_waitqueue_head(&isert_conn->rem_wait);
kref_init(&isert_conn->kref);
mutex_init(&isert_conn->mutex);
- spin_lock_init(&isert_conn->pool_lock);
- INIT_LIST_HEAD(&isert_conn->fr_pool);
INIT_WORK(&isert_conn->release_work, isert_release_work);
}
@@ -610,7 +449,7 @@ isert_alloc_login_buf(struct isert_conn *isert_conn,
isert_conn->login_rsp_buf = kzalloc(ISER_RX_PAYLOAD_SIZE, GFP_KERNEL);
if (!isert_conn->login_rsp_buf) {
- isert_err("Unable to allocate isert_conn->login_rspbuf\n");
+ ret = -ENOMEM;
goto out_unmap_login_req_buf;
}
@@ -739,11 +578,9 @@ isert_connect_release(struct isert_conn *isert_conn)
BUG_ON(!device);
- if (device->use_fastreg)
- isert_conn_free_fastreg_pool(isert_conn);
-
isert_free_rx_descriptors(isert_conn);
- if (isert_conn->cm_id)
+ if (isert_conn->cm_id &&
+ !isert_conn->dev_removed)
rdma_destroy_id(isert_conn->cm_id);
if (isert_conn->qp) {
@@ -758,7 +595,10 @@ isert_connect_release(struct isert_conn *isert_conn)
isert_device_put(device);
- kfree(isert_conn);
+ if (isert_conn->dev_removed)
+ wake_up_interruptible(&isert_conn->rem_wait);
+ else
+ kfree(isert_conn);
}
static void
@@ -918,6 +758,7 @@ static int
isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
struct isert_np *isert_np = cma_id->context;
+ struct isert_conn *isert_conn;
int ret = 0;
isert_info("%s (%d): status %d id %p np %p\n",
@@ -938,10 +779,21 @@ isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
break;
case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */
case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */
- case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */
case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */
ret = isert_disconnected_handler(cma_id, event->event);
break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ isert_conn = cma_id->qp->qp_context;
+ isert_conn->dev_removed = true;
+ isert_disconnected_handler(cma_id, event->event);
+ wait_event_interruptible(isert_conn->rem_wait,
+ isert_conn->state == ISER_CONN_DOWN);
+ kfree(isert_conn);
+ /*
+ * return non-zero from the callback to destroy
+ * the rdma cm id
+ */
+ return 1;
case RDMA_CM_EVENT_REJECTED: /* FALLTHRU */
case RDMA_CM_EVENT_UNREACHABLE: /* FALLTHRU */
case RDMA_CM_EVENT_CONNECT_ERROR:
@@ -1080,7 +932,6 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
{
struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc;
- isert_cmd->iser_ib_op = ISER_IB_SEND;
tx_desc->tx_cqe.done = isert_send_done;
send_wr->wr_cqe = &tx_desc->tx_cqe;
@@ -1160,16 +1011,6 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
}
if (!login->login_failed) {
if (login->login_complete) {
- if (!conn->sess->sess_ops->SessionType &&
- isert_conn->device->use_fastreg) {
- ret = isert_conn_create_fastreg_pool(isert_conn);
- if (ret) {
- isert_err("Conn: %p failed to create"
- " fastreg pool\n", isert_conn);
- return ret;
- }
- }
-
ret = isert_alloc_rx_descriptors(isert_conn);
if (ret)
return ret;
@@ -1633,97 +1474,26 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc)
ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
}
-static int
-isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
- struct scatterlist *sg, u32 nents, u32 length, u32 offset,
- enum iser_ib_op_code op, struct isert_data_buf *data)
-{
- struct ib_device *ib_dev = isert_conn->cm_id->device;
-
- data->dma_dir = op == ISER_IB_RDMA_WRITE ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
-
- data->len = length - offset;
- data->offset = offset;
- data->sg_off = data->offset / PAGE_SIZE;
-
- data->sg = &sg[data->sg_off];
- data->nents = min_t(unsigned int, nents - data->sg_off,
- ISCSI_ISER_SG_TABLESIZE);
- data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE *
- PAGE_SIZE);
-
- data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents,
- data->dma_dir);
- if (unlikely(!data->dma_nents)) {
- isert_err("Cmd: unable to dma map SGs %p\n", sg);
- return -EINVAL;
- }
-
- isert_dbg("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n",
- isert_cmd, data->dma_nents, data->sg, data->nents, data->len);
-
- return 0;
-}
-
-static void
-isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data)
-{
- struct ib_device *ib_dev = isert_conn->cm_id->device;
-
- ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir);
- memset(data, 0, sizeof(*data));
-}
-
-
-
-static void
-isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
-{
- isert_dbg("Cmd %p\n", isert_cmd);
-
- if (isert_cmd->data.sg) {
- isert_dbg("Cmd %p unmap_sg op\n", isert_cmd);
- isert_unmap_data_buf(isert_conn, &isert_cmd->data);
- }
-
- if (isert_cmd->rdma_wr) {
- isert_dbg("Cmd %p free send_wr\n", isert_cmd);
- kfree(isert_cmd->rdma_wr);
- isert_cmd->rdma_wr = NULL;
- }
-
- if (isert_cmd->ib_sge) {
- isert_dbg("Cmd %p free ib_sge\n", isert_cmd);
- kfree(isert_cmd->ib_sge);
- isert_cmd->ib_sge = NULL;
- }
-}
-
static void
-isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
+isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn)
{
- isert_dbg("Cmd %p\n", isert_cmd);
+ struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd;
+ enum dma_data_direction dir = target_reverse_dma_direction(se_cmd);
- if (isert_cmd->fr_desc) {
- isert_dbg("Cmd %p free fr_desc %p\n", isert_cmd, isert_cmd->fr_desc);
- if (isert_cmd->fr_desc->ind & ISERT_PROTECTED) {
- isert_unmap_data_buf(isert_conn, &isert_cmd->prot);
- isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED;
- }
- spin_lock_bh(&isert_conn->pool_lock);
- list_add_tail(&isert_cmd->fr_desc->list, &isert_conn->fr_pool);
- spin_unlock_bh(&isert_conn->pool_lock);
- isert_cmd->fr_desc = NULL;
- }
+ if (!cmd->rw.nr_ops)
+ return;
- if (isert_cmd->data.sg) {
- isert_dbg("Cmd %p unmap_sg op\n", isert_cmd);
- isert_unmap_data_buf(isert_conn, &isert_cmd->data);
+ if (isert_prot_cmd(conn, se_cmd)) {
+ rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp,
+ conn->cm_id->port_num, se_cmd->t_data_sg,
+ se_cmd->t_data_nents, se_cmd->t_prot_sg,
+ se_cmd->t_prot_nents, dir);
+ } else {
+ rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num,
+ se_cmd->t_data_sg, se_cmd->t_data_nents, dir);
}
- isert_cmd->ib_sge = NULL;
- isert_cmd->rdma_wr = NULL;
+ cmd->rw.nr_ops = 0;
}
static void
@@ -1732,7 +1502,6 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
struct isert_conn *isert_conn = isert_cmd->conn;
struct iscsi_conn *conn = isert_conn->conn;
- struct isert_device *device = isert_conn->device;
struct iscsi_text_rsp *hdr;
isert_dbg("Cmd %p\n", isert_cmd);
@@ -1760,7 +1529,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
}
}
- device->unreg_rdma_mem(isert_cmd, isert_conn);
+ isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
transport_generic_free_cmd(&cmd->se_cmd, 0);
break;
case ISCSI_OP_SCSI_TMFUNC:
@@ -1894,14 +1663,9 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
isert_dbg("Cmd %p\n", isert_cmd);
- if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) {
- ret = isert_check_pi_status(cmd,
- isert_cmd->fr_desc->pi_ctx->sig_mr);
- isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED;
- }
+ ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
+ isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
- device->unreg_rdma_mem(isert_cmd, isert_conn);
- isert_cmd->rdma_wr_num = 0;
if (ret)
transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0);
else
@@ -1929,16 +1693,12 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
isert_dbg("Cmd %p\n", isert_cmd);
- if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) {
- ret = isert_check_pi_status(se_cmd,
- isert_cmd->fr_desc->pi_ctx->sig_mr);
- isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED;
- }
-
iscsit_stop_dataout_timer(cmd);
- device->unreg_rdma_mem(isert_cmd, isert_conn);
- cmd->write_data_done = isert_cmd->data.len;
- isert_cmd->rdma_wr_num = 0;
+
+ if (isert_prot_cmd(isert_conn, se_cmd))
+ ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr);
+ isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
+ cmd->write_data_done = 0;
isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd);
spin_lock_bh(&cmd->istate_lock);
@@ -2111,7 +1871,6 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
{
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
- struct isert_device *device = isert_conn->device;
spin_lock_bh(&conn->cmd_lock);
if (!list_empty(&cmd->i_conn_node))
@@ -2120,8 +1879,7 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
if (cmd->data_direction == DMA_TO_DEVICE)
iscsit_stop_dataout_timer(cmd);
-
- device->unreg_rdma_mem(isert_cmd, isert_conn);
+ isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
}
static enum target_prot_op
@@ -2274,234 +2032,6 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
return isert_post_response(isert_conn, isert_cmd);
}
-static int
-isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
- struct ib_sge *ib_sge, struct ib_rdma_wr *rdma_wr,
- u32 data_left, u32 offset)
-{
- struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
- struct scatterlist *sg_start, *tmp_sg;
- struct isert_device *device = isert_conn->device;
- struct ib_device *ib_dev = device->ib_device;
- u32 sg_off, page_off;
- int i = 0, sg_nents;
-
- sg_off = offset / PAGE_SIZE;
- sg_start = &cmd->se_cmd.t_data_sg[sg_off];
- sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge);
- page_off = offset % PAGE_SIZE;
-
- rdma_wr->wr.sg_list = ib_sge;
- rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe;
-
- /*
- * Perform mapping of TCM scatterlist memory ib_sge dma_addr.
- */
- for_each_sg(sg_start, tmp_sg, sg_nents, i) {
- isert_dbg("RDMA from SGL dma_addr: 0x%llx dma_len: %u, "
- "page_off: %u\n",
- (unsigned long long)tmp_sg->dma_address,
- tmp_sg->length, page_off);
-
- ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
- ib_sge->length = min_t(u32, data_left,
- ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
- ib_sge->lkey = device->pd->local_dma_lkey;
-
- isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n",
- ib_sge->addr, ib_sge->length, ib_sge->lkey);
- page_off = 0;
- data_left -= ib_sge->length;
- if (!data_left)
- break;
- ib_sge++;
- isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge);
- }
-
- rdma_wr->wr.num_sge = ++i;
- isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n",
- rdma_wr->wr.sg_list, rdma_wr->wr.num_sge);
-
- return rdma_wr->wr.num_sge;
-}
-
-static int
-isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn)
-{
- struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
- struct se_cmd *se_cmd = &cmd->se_cmd;
- struct isert_conn *isert_conn = conn->context;
- struct isert_data_buf *data = &isert_cmd->data;
- struct ib_rdma_wr *rdma_wr;
- struct ib_sge *ib_sge;
- u32 offset, data_len, data_left, rdma_write_max, va_offset = 0;
- int ret = 0, i, ib_sge_cnt;
-
- offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ?
- cmd->write_data_done : 0;
- ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
- se_cmd->t_data_nents, se_cmd->data_length,
- offset, isert_cmd->iser_ib_op,
- &isert_cmd->data);
- if (ret)
- return ret;
-
- data_left = data->len;
- offset = data->offset;
-
- ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL);
- if (!ib_sge) {
- isert_warn("Unable to allocate ib_sge\n");
- ret = -ENOMEM;
- goto unmap_cmd;
- }
- isert_cmd->ib_sge = ib_sge;
-
- isert_cmd->rdma_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge);
- isert_cmd->rdma_wr = kzalloc(sizeof(struct ib_rdma_wr) *
- isert_cmd->rdma_wr_num, GFP_KERNEL);
- if (!isert_cmd->rdma_wr) {
- isert_dbg("Unable to allocate isert_cmd->rdma_wr\n");
- ret = -ENOMEM;
- goto unmap_cmd;
- }
-
- rdma_write_max = isert_conn->max_sge * PAGE_SIZE;
-
- for (i = 0; i < isert_cmd->rdma_wr_num; i++) {
- rdma_wr = &isert_cmd->rdma_wr[i];
- data_len = min(data_left, rdma_write_max);
-
- rdma_wr->wr.send_flags = 0;
- if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) {
- isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done;
-
- rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
- rdma_wr->remote_addr = isert_cmd->read_va + offset;
- rdma_wr->rkey = isert_cmd->read_stag;
- if (i + 1 == isert_cmd->rdma_wr_num)
- rdma_wr->wr.next = &isert_cmd->tx_desc.send_wr;
- else
- rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr;
- } else {
- isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
-
- rdma_wr->wr.opcode = IB_WR_RDMA_READ;
- rdma_wr->remote_addr = isert_cmd->write_va + va_offset;
- rdma_wr->rkey = isert_cmd->write_stag;
- if (i + 1 == isert_cmd->rdma_wr_num)
- rdma_wr->wr.send_flags = IB_SEND_SIGNALED;
- else
- rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr;
- }
-
- ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge,
- rdma_wr, data_len, offset);
- ib_sge += ib_sge_cnt;
-
- offset += data_len;
- va_offset += data_len;
- data_left -= data_len;
- }
-
- return 0;
-unmap_cmd:
- isert_unmap_data_buf(isert_conn, data);
-
- return ret;
-}
-
-static inline void
-isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
-{
- u32 rkey;
-
- memset(inv_wr, 0, sizeof(*inv_wr));
- inv_wr->wr_cqe = NULL;
- inv_wr->opcode = IB_WR_LOCAL_INV;
- inv_wr->ex.invalidate_rkey = mr->rkey;
-
- /* Bump the key */
- rkey = ib_inc_rkey(mr->rkey);
- ib_update_fast_reg_key(mr, rkey);
-}
-
-static int
-isert_fast_reg_mr(struct isert_conn *isert_conn,
- struct fast_reg_descriptor *fr_desc,
- struct isert_data_buf *mem,
- enum isert_indicator ind,
- struct ib_sge *sge)
-{
- struct isert_device *device = isert_conn->device;
- struct ib_device *ib_dev = device->ib_device;
- struct ib_mr *mr;
- struct ib_reg_wr reg_wr;
- struct ib_send_wr inv_wr, *bad_wr, *wr = NULL;
- int ret, n;
-
- if (mem->dma_nents == 1) {
- sge->lkey = device->pd->local_dma_lkey;
- sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
- sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
- isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n",
- sge->addr, sge->length, sge->lkey);
- return 0;
- }
-
- if (ind == ISERT_DATA_KEY_VALID)
- /* Registering data buffer */
- mr = fr_desc->data_mr;
- else
- /* Registering protection buffer */
- mr = fr_desc->pi_ctx->prot_mr;
-
- if (!(fr_desc->ind & ind)) {
- isert_inv_rkey(&inv_wr, mr);
- wr = &inv_wr;
- }
-
- n = ib_map_mr_sg(mr, mem->sg, mem->nents, PAGE_SIZE);
- if (unlikely(n != mem->nents)) {
- isert_err("failed to map mr sg (%d/%d)\n",
- n, mem->nents);
- return n < 0 ? n : -EINVAL;
- }
-
- isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
- fr_desc, mem->nents, mem->offset);
-
- reg_wr.wr.next = NULL;
- reg_wr.wr.opcode = IB_WR_REG_MR;
- reg_wr.wr.wr_cqe = NULL;
- reg_wr.wr.send_flags = 0;
- reg_wr.wr.num_sge = 0;
- reg_wr.mr = mr;
- reg_wr.key = mr->lkey;
- reg_wr.access = IB_ACCESS_LOCAL_WRITE;
-
- if (!wr)
- wr = &reg_wr.wr;
- else
- wr->next = &reg_wr.wr;
-
- ret = ib_post_send(isert_conn->qp, wr, &bad_wr);
- if (ret) {
- isert_err("fast registration failed, ret:%d\n", ret);
- return ret;
- }
- fr_desc->ind &= ~ind;
-
- sge->lkey = mr->lkey;
- sge->addr = mr->iova;
- sge->length = mr->length;
-
- isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n",
- sge->addr, sge->length, sge->lkey);
-
- return ret;
-}
-
static inline void
isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
struct ib_sig_domain *domain)
@@ -2526,6 +2056,8 @@ isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
static int
isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
{
+ memset(sig_attrs, 0, sizeof(*sig_attrs));
+
switch (se_cmd->prot_op) {
case TARGET_PROT_DIN_INSERT:
case TARGET_PROT_DOUT_STRIP:
@@ -2547,228 +2079,59 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
return -EINVAL;
}
+ sig_attrs->check_mask =
+ (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) |
+ (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) |
+ (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0);
return 0;
}
-static inline u8
-isert_set_prot_checks(u8 prot_checks)
-{
- return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) |
- (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) |
- (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0);
-}
-
-static int
-isert_reg_sig_mr(struct isert_conn *isert_conn,
- struct isert_cmd *isert_cmd,
- struct fast_reg_descriptor *fr_desc)
-{
- struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd;
- struct ib_sig_handover_wr sig_wr;
- struct ib_send_wr inv_wr, *bad_wr, *wr = NULL;
- struct pi_context *pi_ctx = fr_desc->pi_ctx;
- struct ib_sig_attrs sig_attrs;
- int ret;
-
- memset(&sig_attrs, 0, sizeof(sig_attrs));
- ret = isert_set_sig_attrs(se_cmd, &sig_attrs);
- if (ret)
- goto err;
-
- sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks);
-
- if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) {
- isert_inv_rkey(&inv_wr, pi_ctx->sig_mr);
- wr = &inv_wr;
- }
-
- memset(&sig_wr, 0, sizeof(sig_wr));
- sig_wr.wr.opcode = IB_WR_REG_SIG_MR;
- sig_wr.wr.wr_cqe = NULL;
- sig_wr.wr.sg_list = &isert_cmd->ib_sg[DATA];
- sig_wr.wr.num_sge = 1;
- sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE;
- sig_wr.sig_attrs = &sig_attrs;
- sig_wr.sig_mr = pi_ctx->sig_mr;
- if (se_cmd->t_prot_sg)
- sig_wr.prot = &isert_cmd->ib_sg[PROT];
-
- if (!wr)
- wr = &sig_wr.wr;
- else
- wr->next = &sig_wr.wr;
-
- ret = ib_post_send(isert_conn->qp, wr, &bad_wr);
- if (ret) {
- isert_err("fast registration failed, ret:%d\n", ret);
- goto err;
- }
- fr_desc->ind &= ~ISERT_SIG_KEY_VALID;
-
- isert_cmd->ib_sg[SIG].lkey = pi_ctx->sig_mr->lkey;
- isert_cmd->ib_sg[SIG].addr = 0;
- isert_cmd->ib_sg[SIG].length = se_cmd->data_length;
- if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP &&
- se_cmd->prot_op != TARGET_PROT_DOUT_INSERT)
- /*
- * We have protection guards on the wire
- * so we need to set a larget transfer
- */
- isert_cmd->ib_sg[SIG].length += se_cmd->prot_length;
-
- isert_dbg("sig_sge: addr: 0x%llx length: %u lkey: %x\n",
- isert_cmd->ib_sg[SIG].addr, isert_cmd->ib_sg[SIG].length,
- isert_cmd->ib_sg[SIG].lkey);
-err:
- return ret;
-}
-
static int
-isert_handle_prot_cmd(struct isert_conn *isert_conn,
- struct isert_cmd *isert_cmd)
-{
- struct isert_device *device = isert_conn->device;
- struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd;
+isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn,
+ struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd;
+ enum dma_data_direction dir = target_reverse_dma_direction(se_cmd);
+ u8 port_num = conn->cm_id->port_num;
+ u64 addr;
+ u32 rkey, offset;
int ret;
- if (!isert_cmd->fr_desc->pi_ctx) {
- ret = isert_create_pi_ctx(isert_cmd->fr_desc,
- device->ib_device,
- device->pd);
- if (ret) {
- isert_err("conn %p failed to allocate pi_ctx\n",
- isert_conn);
- return ret;
- }
- }
-
- if (se_cmd->t_prot_sg) {
- ret = isert_map_data_buf(isert_conn, isert_cmd,
- se_cmd->t_prot_sg,
- se_cmd->t_prot_nents,
- se_cmd->prot_length,
- 0,
- isert_cmd->iser_ib_op,
- &isert_cmd->prot);
- if (ret) {
- isert_err("conn %p failed to map protection buffer\n",
- isert_conn);
- return ret;
- }
-
- memset(&isert_cmd->ib_sg[PROT], 0, sizeof(isert_cmd->ib_sg[PROT]));
- ret = isert_fast_reg_mr(isert_conn, isert_cmd->fr_desc,
- &isert_cmd->prot,
- ISERT_PROT_KEY_VALID,
- &isert_cmd->ib_sg[PROT]);
- if (ret) {
- isert_err("conn %p failed to fast reg mr\n",
- isert_conn);
- goto unmap_prot_cmd;
- }
- }
-
- ret = isert_reg_sig_mr(isert_conn, isert_cmd, isert_cmd->fr_desc);
- if (ret) {
- isert_err("conn %p failed to fast reg mr\n",
- isert_conn);
- goto unmap_prot_cmd;
- }
- isert_cmd->fr_desc->ind |= ISERT_PROTECTED;
-
- return 0;
-
-unmap_prot_cmd:
- if (se_cmd->t_prot_sg)
- isert_unmap_data_buf(isert_conn, &isert_cmd->prot);
-
- return ret;
-}
-
-static int
-isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn)
-{
- struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
- struct se_cmd *se_cmd = &cmd->se_cmd;
- struct isert_conn *isert_conn = conn->context;
- struct fast_reg_descriptor *fr_desc = NULL;
- struct ib_rdma_wr *rdma_wr;
- struct ib_sge *ib_sg;
- u32 offset;
- int ret = 0;
- unsigned long flags;
-
- offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ?
- cmd->write_data_done : 0;
- ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
- se_cmd->t_data_nents, se_cmd->data_length,
- offset, isert_cmd->iser_ib_op,
- &isert_cmd->data);
- if (ret)
- return ret;
-
- if (isert_cmd->data.dma_nents != 1 ||
- isert_prot_cmd(isert_conn, se_cmd)) {
- spin_lock_irqsave(&isert_conn->pool_lock, flags);
- fr_desc = list_first_entry(&isert_conn->fr_pool,
- struct fast_reg_descriptor, list);
- list_del(&fr_desc->list);
- spin_unlock_irqrestore(&isert_conn->pool_lock, flags);
- isert_cmd->fr_desc = fr_desc;
- }
-
- ret = isert_fast_reg_mr(isert_conn, fr_desc, &isert_cmd->data,
- ISERT_DATA_KEY_VALID, &isert_cmd->ib_sg[DATA]);
- if (ret)
- goto unmap_cmd;
-
- if (isert_prot_cmd(isert_conn, se_cmd)) {
- ret = isert_handle_prot_cmd(isert_conn, isert_cmd);
- if (ret)
- goto unmap_cmd;
-
- ib_sg = &isert_cmd->ib_sg[SIG];
+ if (dir == DMA_FROM_DEVICE) {
+ addr = cmd->write_va;
+ rkey = cmd->write_stag;
+ offset = cmd->iscsi_cmd->write_data_done;
} else {
- ib_sg = &isert_cmd->ib_sg[DATA];
+ addr = cmd->read_va;
+ rkey = cmd->read_stag;
+ offset = 0;
}
- memcpy(&isert_cmd->s_ib_sge, ib_sg, sizeof(*ib_sg));
- isert_cmd->ib_sge = &isert_cmd->s_ib_sge;
- isert_cmd->rdma_wr_num = 1;
- memset(&isert_cmd->s_rdma_wr, 0, sizeof(isert_cmd->s_rdma_wr));
- isert_cmd->rdma_wr = &isert_cmd->s_rdma_wr;
+ if (isert_prot_cmd(conn, se_cmd)) {
+ struct ib_sig_attrs sig_attrs;
- rdma_wr = &isert_cmd->s_rdma_wr;
- rdma_wr->wr.sg_list = &isert_cmd->s_ib_sge;
- rdma_wr->wr.num_sge = 1;
- rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe;
- if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) {
- isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done;
+ ret = isert_set_sig_attrs(se_cmd, &sig_attrs);
+ if (ret)
+ return ret;
- rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
- rdma_wr->remote_addr = isert_cmd->read_va;
- rdma_wr->rkey = isert_cmd->read_stag;
- rdma_wr->wr.send_flags = !isert_prot_cmd(isert_conn, se_cmd) ?
- 0 : IB_SEND_SIGNALED;
+ WARN_ON_ONCE(offset);
+ ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num,
+ se_cmd->t_data_sg, se_cmd->t_data_nents,
+ se_cmd->t_prot_sg, se_cmd->t_prot_nents,
+ &sig_attrs, addr, rkey, dir);
} else {
- isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
-
- rdma_wr->wr.opcode = IB_WR_RDMA_READ;
- rdma_wr->remote_addr = isert_cmd->write_va;
- rdma_wr->rkey = isert_cmd->write_stag;
- rdma_wr->wr.send_flags = IB_SEND_SIGNALED;
+ ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num,
+ se_cmd->t_data_sg, se_cmd->t_data_nents,
+ offset, addr, rkey, dir);
}
-
- return 0;
-
-unmap_cmd:
- if (fr_desc) {
- spin_lock_irqsave(&isert_conn->pool_lock, flags);
- list_add_tail(&fr_desc->list, &isert_conn->fr_pool);
- spin_unlock_irqrestore(&isert_conn->pool_lock, flags);
+ if (ret < 0) {
+ isert_err("Cmd: %p failed to prepare RDMA res\n", cmd);
+ return ret;
}
- isert_unmap_data_buf(isert_conn, &isert_cmd->data);
+ ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr);
+ if (ret < 0)
+ isert_err("Cmd: %p failed to post RDMA res\n", cmd);
return ret;
}
@@ -2778,21 +2141,17 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
struct se_cmd *se_cmd = &cmd->se_cmd;
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
struct isert_conn *isert_conn = conn->context;
- struct isert_device *device = isert_conn->device;
- struct ib_send_wr *wr_failed;
+ struct ib_cqe *cqe = NULL;
+ struct ib_send_wr *chain_wr = NULL;
int rc;
isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n",
isert_cmd, se_cmd->data_length);
- isert_cmd->iser_ib_op = ISER_IB_RDMA_WRITE;
- rc = device->reg_rdma_mem(isert_cmd, conn);
- if (rc) {
- isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
- return rc;
- }
-
- if (!isert_prot_cmd(isert_conn, se_cmd)) {
+ if (isert_prot_cmd(isert_conn, se_cmd)) {
+ isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done;
+ cqe = &isert_cmd->tx_desc.tx_cqe;
+ } else {
/*
* Build isert_conn->tx_desc for iSCSI response PDU and attach
*/
@@ -2803,56 +2162,35 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
isert_init_send_wr(isert_conn, isert_cmd,
&isert_cmd->tx_desc.send_wr);
- isert_cmd->s_rdma_wr.wr.next = &isert_cmd->tx_desc.send_wr;
- isert_cmd->rdma_wr_num += 1;
rc = isert_post_recv(isert_conn, isert_cmd->rx_desc);
if (rc) {
isert_err("ib_post_recv failed with %d\n", rc);
return rc;
}
- }
-
- rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed);
- if (rc)
- isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n");
- if (!isert_prot_cmd(isert_conn, se_cmd))
- isert_dbg("Cmd: %p posted RDMA_WRITE + Response for iSER Data "
- "READ\n", isert_cmd);
- else
- isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n",
- isert_cmd);
+ chain_wr = &isert_cmd->tx_desc.send_wr;
+ }
+ isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr);
+ isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd);
return 1;
}
static int
isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
{
- struct se_cmd *se_cmd = &cmd->se_cmd;
struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
- struct isert_conn *isert_conn = conn->context;
- struct isert_device *device = isert_conn->device;
- struct ib_send_wr *wr_failed;
- int rc;
isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n",
- isert_cmd, se_cmd->data_length, cmd->write_data_done);
- isert_cmd->iser_ib_op = ISER_IB_RDMA_READ;
- rc = device->reg_rdma_mem(isert_cmd, conn);
- if (rc) {
- isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
- return rc;
- }
+ isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done);
- rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed);
- if (rc)
- isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n");
+ isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
+ isert_rdma_rw_ctx_post(isert_cmd, conn->context,
+ &isert_cmd->tx_desc.tx_cqe, NULL);
isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n",
isert_cmd);
-
return 0;
}
@@ -3273,9 +2611,19 @@ static void isert_free_conn(struct iscsi_conn *conn)
isert_put_conn(isert_conn);
}
+static void isert_get_rx_pdu(struct iscsi_conn *conn)
+{
+ struct completion comp;
+
+ init_completion(&comp);
+
+ wait_for_completion_interruptible(&comp);
+}
+
static struct iscsit_transport iser_target_transport = {
.name = "IB/iSER",
.transport_type = ISCSI_INFINIBAND,
+ .rdma_shutdown = true,
.priv_size = sizeof(struct isert_cmd),
.owner = THIS_MODULE,
.iscsit_setup_np = isert_setup_np,
@@ -3291,6 +2639,7 @@ static struct iscsit_transport iser_target_transport = {
.iscsit_queue_data_in = isert_put_datain,
.iscsit_queue_status = isert_put_response,
.iscsit_aborted_task = isert_aborted_task,
+ .iscsit_get_rx_pdu = isert_get_rx_pdu,
.iscsit_get_sup_prot_ops = isert_get_sup_prot_ops,
};
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 147900cbb578..c02ada57d7f5 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -3,6 +3,7 @@
#include <linux/in6.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
+#include <rdma/rw.h>
#include <scsi/iser.h>
@@ -53,10 +54,7 @@
#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
-#define ISERT_INFLIGHT_DATAOUTS 8
-
-#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \
- (1 + ISERT_INFLIGHT_DATAOUTS) + \
+#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \
ISERT_MAX_TX_MISC_PDUS + \
ISERT_MAX_RX_MISC_PDUS)
@@ -71,13 +69,6 @@ enum isert_desc_type {
ISCSI_TX_DATAIN
};
-enum iser_ib_op_code {
- ISER_IB_RECV,
- ISER_IB_SEND,
- ISER_IB_RDMA_WRITE,
- ISER_IB_RDMA_READ,
-};
-
enum iser_conn_state {
ISER_CONN_INIT,
ISER_CONN_UP,
@@ -118,42 +109,6 @@ static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe)
return container_of(cqe, struct iser_tx_desc, tx_cqe);
}
-
-enum isert_indicator {
- ISERT_PROTECTED = 1 << 0,
- ISERT_DATA_KEY_VALID = 1 << 1,
- ISERT_PROT_KEY_VALID = 1 << 2,
- ISERT_SIG_KEY_VALID = 1 << 3,
-};
-
-struct pi_context {
- struct ib_mr *prot_mr;
- struct ib_mr *sig_mr;
-};
-
-struct fast_reg_descriptor {
- struct list_head list;
- struct ib_mr *data_mr;
- u8 ind;
- struct pi_context *pi_ctx;
-};
-
-struct isert_data_buf {
- struct scatterlist *sg;
- int nents;
- u32 sg_off;
- u32 len; /* cur_rdma_length */
- u32 offset;
- unsigned int dma_nents;
- enum dma_data_direction dma_dir;
-};
-
-enum {
- DATA = 0,
- PROT = 1,
- SIG = 2,
-};
-
struct isert_cmd {
uint32_t read_stag;
uint32_t write_stag;
@@ -166,16 +121,7 @@ struct isert_cmd {
struct iscsi_cmd *iscsi_cmd;
struct iser_tx_desc tx_desc;
struct iser_rx_desc *rx_desc;
- enum iser_ib_op_code iser_ib_op;
- struct ib_sge *ib_sge;
- struct ib_sge s_ib_sge;
- int rdma_wr_num;
- struct ib_rdma_wr *rdma_wr;
- struct ib_rdma_wr s_rdma_wr;
- struct ib_sge ib_sg[3];
- struct isert_data_buf data;
- struct isert_data_buf prot;
- struct fast_reg_descriptor *fr_desc;
+ struct rdma_rw_ctx rw;
struct work_struct comp_work;
struct scatterlist sg;
};
@@ -192,7 +138,6 @@ struct isert_conn {
u32 responder_resources;
u32 initiator_depth;
bool pi_support;
- u32 max_sge;
struct iser_rx_desc *login_req_buf;
char *login_rsp_buf;
u64 login_req_dma;
@@ -210,13 +155,11 @@ struct isert_conn {
struct isert_device *device;
struct mutex mutex;
struct kref kref;
- struct list_head fr_pool;
- int fr_pool_size;
- /* lock to protect fastreg pool */
- spinlock_t pool_lock;
struct work_struct release_work;
bool logout_posted;
bool snd_w_inv;
+ wait_queue_head_t rem_wait;
+ bool dev_removed;
};
#define ISERT_MAX_CQ 64
@@ -236,7 +179,6 @@ struct isert_comp {
};
struct isert_device {
- int use_fastreg;
bool pi_capable;
int refcount;
struct ib_device *ib_device;
@@ -244,10 +186,6 @@ struct isert_device {
struct isert_comp *comps;
int comps_used;
struct list_head dev_node;
- int (*reg_rdma_mem)(struct isert_cmd *isert_cmd,
- struct iscsi_conn *conn);
- void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd,
- struct isert_conn *isert_conn);
};
struct isert_np {
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index b6bf20496021..3322ed750172 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -70,6 +70,7 @@ static unsigned int indirect_sg_entries;
static bool allow_ext_sg;
static bool prefer_fr = true;
static bool register_always = true;
+static bool never_register;
static int topspin_workarounds = 1;
module_param(srp_sg_tablesize, uint, 0444);
@@ -81,7 +82,7 @@ MODULE_PARM_DESC(cmd_sg_entries,
module_param(indirect_sg_entries, uint, 0444);
MODULE_PARM_DESC(indirect_sg_entries,
- "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")");
+ "Default max number of gather/scatter entries (default is 12, max is " __stringify(SG_MAX_SEGMENTS) ")");
module_param(allow_ext_sg, bool, 0444);
MODULE_PARM_DESC(allow_ext_sg,
@@ -99,6 +100,9 @@ module_param(register_always, bool, 0444);
MODULE_PARM_DESC(register_always,
"Use memory registration even for contiguous memory regions");
+module_param(never_register, bool, 0444);
+MODULE_PARM_DESC(never_register, "Never register memory");
+
static const struct kernel_param_ops srp_tmo_ops;
static int srp_reconnect_delay = 10;
@@ -316,7 +320,7 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
struct ib_fmr_pool_param fmr_param;
memset(&fmr_param, 0, sizeof(fmr_param));
- fmr_param.pool_size = target->scsi_host->can_queue;
+ fmr_param.pool_size = target->mr_pool_size;
fmr_param.dirty_watermark = fmr_param.pool_size / 4;
fmr_param.cache = 1;
fmr_param.max_pages_per_fmr = dev->max_pages_per_mr;
@@ -441,23 +445,22 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
{
struct srp_device *dev = target->srp_host->srp_dev;
- return srp_create_fr_pool(dev->dev, dev->pd,
- target->scsi_host->can_queue,
+ return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size,
dev->max_pages_per_mr);
}
/**
* srp_destroy_qp() - destroy an RDMA queue pair
- * @ch: SRP RDMA channel.
+ * @qp: RDMA queue pair.
*
* Drain the qp before destroying it. This avoids that the receive
* completion handler can access the queue pair while it is
* being destroyed.
*/
-static void srp_destroy_qp(struct srp_rdma_ch *ch)
+static void srp_destroy_qp(struct ib_qp *qp)
{
- ib_drain_rq(ch->qp);
- ib_destroy_qp(ch->qp);
+ ib_drain_rq(qp);
+ ib_destroy_qp(qp);
}
static int srp_create_ch_ib(struct srp_rdma_ch *ch)
@@ -469,7 +472,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
struct ib_qp *qp;
struct ib_fmr_pool *fmr_pool = NULL;
struct srp_fr_pool *fr_pool = NULL;
- const int m = dev->use_fast_reg ? 3 : 1;
+ const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2;
int ret;
init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
@@ -530,7 +533,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
}
if (ch->qp)
- srp_destroy_qp(ch);
+ srp_destroy_qp(ch->qp);
if (ch->recv_cq)
ib_free_cq(ch->recv_cq);
if (ch->send_cq)
@@ -554,7 +557,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
return 0;
err_qp:
- srp_destroy_qp(ch);
+ srp_destroy_qp(qp);
err_send_cq:
ib_free_cq(send_cq);
@@ -597,7 +600,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
ib_destroy_fmr_pool(ch->fmr_pool);
}
- srp_destroy_qp(ch);
+ srp_destroy_qp(ch->qp);
ib_free_cq(ch->send_cq);
ib_free_cq(ch->recv_cq);
@@ -850,7 +853,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch)
for (i = 0; i < target->req_ring_size; ++i) {
req = &ch->req_ring[i];
- mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+ mr_list = kmalloc(target->mr_per_cmd * sizeof(void *),
GFP_KERNEL);
if (!mr_list)
goto out;
@@ -1112,7 +1115,7 @@ static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch,
}
/**
- * srp_free_req() - Unmap data and add request to the free request list.
+ * srp_free_req() - Unmap data and adjust ch->req_lim.
* @ch: SRP RDMA channel.
* @req: Request to be freed.
* @scmnd: SCSI command associated with @req.
@@ -1299,9 +1302,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc)
srp_handle_qp_err(cq, wc, "FAST REG");
}
+/*
+ * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset
+ * where to start in the first element. If sg_offset_p != NULL then
+ * *sg_offset_p is updated to the offset in state->sg[retval] of the first
+ * byte that has not yet been mapped.
+ */
static int srp_map_finish_fr(struct srp_map_state *state,
struct srp_request *req,
- struct srp_rdma_ch *ch, int sg_nents)
+ struct srp_rdma_ch *ch, int sg_nents,
+ unsigned int *sg_offset_p)
{
struct srp_target_port *target = ch->target;
struct srp_device *dev = target->srp_host->srp_dev;
@@ -1316,13 +1326,14 @@ static int srp_map_finish_fr(struct srp_map_state *state,
WARN_ON_ONCE(!dev->use_fast_reg);
- if (sg_nents == 0)
- return 0;
-
if (sg_nents == 1 && target->global_mr) {
- srp_map_desc(state, sg_dma_address(state->sg),
- sg_dma_len(state->sg),
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+
+ srp_map_desc(state, sg_dma_address(state->sg) + sg_offset,
+ sg_dma_len(state->sg) - sg_offset,
target->global_mr->rkey);
+ if (sg_offset_p)
+ *sg_offset_p = 0;
return 1;
}
@@ -1333,9 +1344,17 @@ static int srp_map_finish_fr(struct srp_map_state *state,
rkey = ib_inc_rkey(desc->mr->rkey);
ib_update_fast_reg_key(desc->mr, rkey);
- n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size);
- if (unlikely(n < 0))
+ n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p,
+ dev->mr_page_size);
+ if (unlikely(n < 0)) {
+ srp_fr_pool_put(ch->fr_pool, &desc, 1);
+ pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n",
+ dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
+ sg_offset_p ? *sg_offset_p : -1, n);
return n;
+ }
+
+ WARN_ON_ONCE(desc->mr->length == 0);
req->reg_cqe.done = srp_reg_mr_err_done;
@@ -1357,8 +1376,10 @@ static int srp_map_finish_fr(struct srp_map_state *state,
desc->mr->length, desc->mr->rkey);
err = ib_post_send(ch->qp, &wr.wr, &bad_wr);
- if (unlikely(err))
+ if (unlikely(err)) {
+ WARN_ON_ONCE(err == -ENOMEM);
return err;
+ }
return n;
}
@@ -1398,7 +1419,7 @@ static int srp_map_sg_entry(struct srp_map_state *state,
/*
* If the last entry of the MR wasn't a full page, then we need to
* close it out and start a new one -- we can only merge at page
- * boundries.
+ * boundaries.
*/
ret = 0;
if (len != dev->mr_page_size)
@@ -1413,10 +1434,9 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
struct scatterlist *sg;
int i, ret;
- state->desc = req->indirect_desc;
state->pages = req->map_page;
state->fmr.next = req->fmr_list;
- state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt;
+ state->fmr.end = req->fmr_list + ch->target->mr_per_cmd;
for_each_sg(scat, sg, count, i) {
ret = srp_map_sg_entry(state, ch, sg, i);
@@ -1428,8 +1448,6 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch,
if (ret)
return ret;
- req->nmdesc = state->nmdesc;
-
return 0;
}
@@ -1437,15 +1455,19 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
struct srp_request *req, struct scatterlist *scat,
int count)
{
- state->desc = req->indirect_desc;
+ unsigned int sg_offset = 0;
+
state->fr.next = req->fr_list;
- state->fr.end = req->fr_list + ch->target->cmd_sg_cnt;
+ state->fr.end = req->fr_list + ch->target->mr_per_cmd;
state->sg = scat;
+ if (count == 0)
+ return 0;
+
while (count) {
int i, n;
- n = srp_map_finish_fr(state, req, ch, count);
+ n = srp_map_finish_fr(state, req, ch, count, &sg_offset);
if (unlikely(n < 0))
return n;
@@ -1454,8 +1476,6 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch,
state->sg = sg_next(state->sg);
}
- req->nmdesc = state->nmdesc;
-
return 0;
}
@@ -1468,15 +1488,12 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch,
struct scatterlist *sg;
int i;
- state->desc = req->indirect_desc;
for_each_sg(scat, sg, count, i) {
srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
ib_sg_dma_len(dev->dev, sg),
target->global_mr->rkey);
}
- req->nmdesc = state->nmdesc;
-
return 0;
}
@@ -1509,14 +1526,15 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
if (dev->use_fast_reg) {
state.sg = idb_sg;
- sg_set_buf(idb_sg, req->indirect_desc, idb_len);
+ sg_init_one(idb_sg, req->indirect_desc, idb_len);
idb_sg->dma_address = req->indirect_dma_addr; /* hack! */
#ifdef CONFIG_NEED_SG_DMA_LENGTH
idb_sg->dma_length = idb_sg->length; /* hack^2 */
#endif
- ret = srp_map_finish_fr(&state, req, ch, 1);
+ ret = srp_map_finish_fr(&state, req, ch, 1, NULL);
if (ret < 0)
return ret;
+ WARN_ON_ONCE(ret < 1);
} else if (dev->use_fmr) {
state.pages = idb_pages;
state.pages[0] = (req->indirect_dma_addr &
@@ -1534,6 +1552,41 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
return 0;
}
+#if defined(DYNAMIC_DATA_DEBUG)
+static void srp_check_mapping(struct srp_map_state *state,
+ struct srp_rdma_ch *ch, struct srp_request *req,
+ struct scatterlist *scat, int count)
+{
+ struct srp_device *dev = ch->target->srp_host->srp_dev;
+ struct srp_fr_desc **pfr;
+ u64 desc_len = 0, mr_len = 0;
+ int i;
+
+ for (i = 0; i < state->ndesc; i++)
+ desc_len += be32_to_cpu(req->indirect_desc[i].len);
+ if (dev->use_fast_reg)
+ for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
+ mr_len += (*pfr)->mr->length;
+ else if (dev->use_fmr)
+ for (i = 0; i < state->nmdesc; i++)
+ mr_len += be32_to_cpu(req->indirect_desc[i].len);
+ if (desc_len != scsi_bufflen(req->scmnd) ||
+ mr_len > scsi_bufflen(req->scmnd))
+ pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
+ scsi_bufflen(req->scmnd), desc_len, mr_len,
+ state->ndesc, state->nmdesc);
+}
+#endif
+
+/**
+ * srp_map_data() - map SCSI data buffer onto an SRP request
+ * @scmnd: SCSI command to map
+ * @ch: SRP RDMA channel
+ * @req: SRP request
+ *
+ * Returns the length in bytes of the SRP_CMD IU or a negative value if
+ * mapping failed.
+ */
static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
struct srp_request *req)
{
@@ -1600,12 +1653,25 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
target->indirect_size, DMA_TO_DEVICE);
memset(&state, 0, sizeof(state));
+ state.desc = req->indirect_desc;
if (dev->use_fast_reg)
- srp_map_sg_fr(&state, ch, req, scat, count);
+ ret = srp_map_sg_fr(&state, ch, req, scat, count);
else if (dev->use_fmr)
- srp_map_sg_fmr(&state, ch, req, scat, count);
+ ret = srp_map_sg_fmr(&state, ch, req, scat, count);
else
- srp_map_sg_dma(&state, ch, req, scat, count);
+ ret = srp_map_sg_dma(&state, ch, req, scat, count);
+ req->nmdesc = state.nmdesc;
+ if (ret < 0)
+ goto unmap;
+
+#if defined(DYNAMIC_DEBUG)
+ {
+ DEFINE_DYNAMIC_DEBUG_METADATA(ddm,
+ "Memory mapping consistency check");
+ if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT))
+ srp_check_mapping(&state, ch, req, scat, count);
+ }
+#endif
/* We've mapped the request, now pull as much of the indirect
* descriptor table as we can into the command buffer. If this
@@ -1628,7 +1694,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
!target->allow_ext_sg)) {
shost_printk(KERN_ERR, target->scsi_host,
"Could not fit S/G list into SRP_CMD\n");
- return -EIO;
+ ret = -EIO;
+ goto unmap;
}
count = min(state.ndesc, target->cmd_sg_cnt);
@@ -1646,7 +1713,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
idb_len, &idb_rkey);
if (ret < 0)
- return ret;
+ goto unmap;
req->nmdesc++;
} else {
idb_rkey = cpu_to_be32(target->global_mr->rkey);
@@ -1672,6 +1739,12 @@ map_complete:
cmd->buf_fmt = fmt;
return len;
+
+unmap:
+ srp_unmap_data(scmnd, ch, req);
+ if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size)
+ ret = -E2BIG;
+ return ret;
}
/*
@@ -2564,6 +2637,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
}
+static int srp_slave_alloc(struct scsi_device *sdev)
+{
+ struct Scsi_Host *shost = sdev->host;
+ struct srp_target_port *target = host_to_target(shost);
+ struct srp_device *srp_dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = srp_dev->dev;
+
+ if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+ blk_queue_virt_boundary(sdev->request_queue,
+ ~srp_dev->mr_page_mask);
+
+ return 0;
+}
+
static int srp_slave_configure(struct scsi_device *sdev)
{
struct Scsi_Host *shost = sdev->host;
@@ -2755,6 +2842,7 @@ static struct scsi_host_template srp_template = {
.module = THIS_MODULE,
.name = "InfiniBand SRP initiator",
.proc_name = DRV_NAME,
+ .slave_alloc = srp_slave_alloc,
.slave_configure = srp_slave_configure,
.info = srp_target_info,
.queuecommand = srp_queuecommand,
@@ -2819,7 +2907,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
spin_unlock(&host->target_lock);
scsi_scan_target(&target->scsi_host->shost_gendev,
- 0, target->scsi_id, SCAN_WILD_CARD, 0);
+ 0, target->scsi_id, SCAN_WILD_CARD, SCSI_SCAN_INITIAL);
if (srp_connected_ch(target) < target->ch_count ||
target->qp_in_error) {
@@ -2829,7 +2917,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
goto out;
}
- pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n",
+ pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n",
dev_name(&target->scsi_host->shost_gendev),
srp_sdev_count(target->scsi_host));
@@ -3097,7 +3185,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
case SRP_OPT_SG_TABLESIZE:
if (match_int(args, &token) || token < 1 ||
- token > SCSI_MAX_SG_CHAIN_SEGMENTS) {
+ token > SG_MAX_SEGMENTS) {
pr_warn("bad max sg_tablesize parameter '%s'\n",
p);
goto out;
@@ -3161,6 +3249,7 @@ static ssize_t srp_create_target(struct device *dev,
struct srp_device *srp_dev = host->srp_dev;
struct ib_device *ibdev = srp_dev->dev;
int ret, node_idx, node, cpu, i;
+ unsigned int max_sectors_per_mr, mr_per_cmd = 0;
bool multich = false;
target_host = scsi_host_alloc(&srp_template,
@@ -3217,7 +3306,33 @@ static ssize_t srp_create_target(struct device *dev,
target->sg_tablesize = target->cmd_sg_cnt;
}
+ if (srp_dev->use_fast_reg || srp_dev->use_fmr) {
+ /*
+ * FR and FMR can only map one HCA page per entry. If the
+ * start address is not aligned on a HCA page boundary two
+ * entries will be used for the head and the tail although
+ * these two entries combined contain at most one HCA page of
+ * data. Hence the "+ 1" in the calculation below.
+ *
+ * The indirect data buffer descriptor is contiguous so the
+ * memory for that buffer will only be registered if
+ * register_always is true. Hence add one to mr_per_cmd if
+ * register_always has been set.
+ */
+ max_sectors_per_mr = srp_dev->max_pages_per_mr <<
+ (ilog2(srp_dev->mr_page_size) - 9);
+ mr_per_cmd = register_always +
+ (target->scsi_host->max_sectors + 1 +
+ max_sectors_per_mr - 1) / max_sectors_per_mr;
+ pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n",
+ target->scsi_host->max_sectors,
+ srp_dev->max_pages_per_mr, srp_dev->mr_page_size,
+ max_sectors_per_mr, mr_per_cmd);
+ }
+
target_host->sg_tablesize = target->sg_tablesize;
+ target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd;
+ target->mr_per_cmd = mr_per_cmd;
target->indirect_size = target->sg_tablesize *
sizeof (struct srp_direct_buf);
target->max_iu_len = sizeof (struct srp_cmd) +
@@ -3410,21 +3525,10 @@ static void srp_add_one(struct ib_device *device)
int mr_page_shift, p;
u64 max_pages_per_mr;
- srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL);
+ srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL);
if (!srp_dev)
return;
- srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
- device->map_phys_fmr && device->unmap_fmr);
- srp_dev->has_fr = (device->attrs.device_cap_flags &
- IB_DEVICE_MEM_MGT_EXTENSIONS);
- if (!srp_dev->has_fmr && !srp_dev->has_fr)
- dev_warn(&device->dev, "neither FMR nor FR is supported\n");
-
- srp_dev->use_fast_reg = (srp_dev->has_fr &&
- (!srp_dev->has_fmr || prefer_fr));
- srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
-
/*
* Use the smallest page size supported by the HCA, down to a
* minimum of 4096 bytes. We're unlikely to build large sglists
@@ -3435,8 +3539,25 @@ static void srp_add_one(struct ib_device *device)
srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1);
max_pages_per_mr = device->attrs.max_mr_size;
do_div(max_pages_per_mr, srp_dev->mr_page_size);
+ pr_debug("%s: %llu / %u = %llu <> %u\n", __func__,
+ device->attrs.max_mr_size, srp_dev->mr_page_size,
+ max_pages_per_mr, SRP_MAX_PAGES_PER_MR);
srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
max_pages_per_mr);
+
+ srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+ device->map_phys_fmr && device->unmap_fmr);
+ srp_dev->has_fr = (device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS);
+ if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) {
+ dev_warn(&device->dev, "neither FMR nor FR is supported\n");
+ } else if (!never_register &&
+ device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) {
+ srp_dev->use_fast_reg = (srp_dev->has_fr &&
+ (!srp_dev->has_fmr || prefer_fr));
+ srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+ }
+
if (srp_dev->use_fast_reg) {
srp_dev->max_pages_per_mr =
min_t(u32, srp_dev->max_pages_per_mr,
@@ -3456,15 +3577,14 @@ static void srp_add_one(struct ib_device *device)
if (IS_ERR(srp_dev->pd))
goto free_dev;
- if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) {
+ if (never_register || !register_always ||
+ (!srp_dev->has_fmr && !srp_dev->has_fr)) {
srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd,
IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE);
if (IS_ERR(srp_dev->global_mr))
goto err_pd;
- } else {
- srp_dev->global_mr = NULL;
}
for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 9e05ce4a04fd..26bb9b0a7a63 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -202,6 +202,8 @@ struct srp_target_port {
char target_name[32];
unsigned int scsi_id;
unsigned int sg_tablesize;
+ int mr_pool_size;
+ int mr_per_cmd;
int queue_size;
int req_ring_size;
int comp_vector;
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 8b42401d4795..883bbfe08e0e 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -254,8 +254,8 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad)
memset(cif, 0, sizeof(*cif));
cif->base_version = 1;
cif->class_version = 1;
- cif->resp_time_value = 20;
+ ib_set_cpi_resp_time(cif, 20);
mad->mad_hdr.status = 0;
}
@@ -522,6 +522,11 @@ static int srpt_refresh_port(struct srpt_port *sport)
if (ret)
goto err_query_port;
+ snprintf(sport->port_guid, sizeof(sport->port_guid),
+ "0x%016llx%016llx",
+ be64_to_cpu(sport->gid.global.subnet_prefix),
+ be64_to_cpu(sport->gid.global.interface_id));
+
if (!sport->mad_agent) {
memset(&reg_req, 0, sizeof(reg_req));
reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT;
@@ -765,52 +770,6 @@ static int srpt_post_recv(struct srpt_device *sdev,
}
/**
- * srpt_post_send() - Post an IB send request.
- *
- * Returns zero upon success and a non-zero value upon failure.
- */
-static int srpt_post_send(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx, int len)
-{
- struct ib_sge list;
- struct ib_send_wr wr, *bad_wr;
- struct srpt_device *sdev = ch->sport->sdev;
- int ret;
-
- atomic_inc(&ch->req_lim);
-
- ret = -ENOMEM;
- if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) {
- pr_warn("IB send queue full (needed 1)\n");
- goto out;
- }
-
- ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len,
- DMA_TO_DEVICE);
-
- list.addr = ioctx->ioctx.dma;
- list.length = len;
- list.lkey = sdev->pd->local_dma_lkey;
-
- ioctx->ioctx.cqe.done = srpt_send_done;
- wr.next = NULL;
- wr.wr_cqe = &ioctx->ioctx.cqe;
- wr.sg_list = &list;
- wr.num_sge = 1;
- wr.opcode = IB_WR_SEND;
- wr.send_flags = IB_SEND_SIGNALED;
-
- ret = ib_post_send(ch->qp, &wr, &bad_wr);
-
-out:
- if (ret < 0) {
- atomic_inc(&ch->sq_wr_avail);
- atomic_dec(&ch->req_lim);
- }
- return ret;
-}
-
-/**
* srpt_zerolength_write() - Perform a zero-length RDMA write.
*
* A quote from the InfiniBand specification: C9-88: For an HCA responder
@@ -843,6 +802,110 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
}
}
+static int srpt_alloc_rw_ctxs(struct srpt_send_ioctx *ioctx,
+ struct srp_direct_buf *db, int nbufs, struct scatterlist **sg,
+ unsigned *sg_cnt)
+{
+ enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd);
+ struct srpt_rdma_ch *ch = ioctx->ch;
+ struct scatterlist *prev = NULL;
+ unsigned prev_nents;
+ int ret, i;
+
+ if (nbufs == 1) {
+ ioctx->rw_ctxs = &ioctx->s_rw_ctx;
+ } else {
+ ioctx->rw_ctxs = kmalloc_array(nbufs, sizeof(*ioctx->rw_ctxs),
+ GFP_KERNEL);
+ if (!ioctx->rw_ctxs)
+ return -ENOMEM;
+ }
+
+ for (i = ioctx->n_rw_ctx; i < nbufs; i++, db++) {
+ struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+ u64 remote_addr = be64_to_cpu(db->va);
+ u32 size = be32_to_cpu(db->len);
+ u32 rkey = be32_to_cpu(db->key);
+
+ ret = target_alloc_sgl(&ctx->sg, &ctx->nents, size, false,
+ i < nbufs - 1);
+ if (ret)
+ goto unwind;
+
+ ret = rdma_rw_ctx_init(&ctx->rw, ch->qp, ch->sport->port,
+ ctx->sg, ctx->nents, 0, remote_addr, rkey, dir);
+ if (ret < 0) {
+ target_free_sgl(ctx->sg, ctx->nents);
+ goto unwind;
+ }
+
+ ioctx->n_rdma += ret;
+ ioctx->n_rw_ctx++;
+
+ if (prev) {
+ sg_unmark_end(&prev[prev_nents - 1]);
+ sg_chain(prev, prev_nents + 1, ctx->sg);
+ } else {
+ *sg = ctx->sg;
+ }
+
+ prev = ctx->sg;
+ prev_nents = ctx->nents;
+
+ *sg_cnt += ctx->nents;
+ }
+
+ return 0;
+
+unwind:
+ while (--i >= 0) {
+ struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+ rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port,
+ ctx->sg, ctx->nents, dir);
+ target_free_sgl(ctx->sg, ctx->nents);
+ }
+ if (ioctx->rw_ctxs != &ioctx->s_rw_ctx)
+ kfree(ioctx->rw_ctxs);
+ return ret;
+}
+
+static void srpt_free_rw_ctxs(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx)
+{
+ enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd);
+ int i;
+
+ for (i = 0; i < ioctx->n_rw_ctx; i++) {
+ struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+ rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port,
+ ctx->sg, ctx->nents, dir);
+ target_free_sgl(ctx->sg, ctx->nents);
+ }
+
+ if (ioctx->rw_ctxs != &ioctx->s_rw_ctx)
+ kfree(ioctx->rw_ctxs);
+}
+
+static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd)
+{
+ /*
+ * The pointer computations below will only be compiled correctly
+ * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
+ * whether srp_cmd::add_data has been declared as a byte pointer.
+ */
+ BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) &&
+ !__same_type(srp_cmd->add_data[0], (u8)0));
+
+ /*
+ * According to the SRP spec, the lower two bits of the 'ADDITIONAL
+ * CDB LENGTH' field are reserved and the size in bytes of this field
+ * is four times the value specified in bits 3..7. Hence the "& ~3".
+ */
+ return srp_cmd->add_data + (srp_cmd->add_cdb_len & ~3);
+}
+
/**
* srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request.
* @ioctx: Pointer to the I/O context associated with the request.
@@ -858,94 +921,59 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc)
* -ENOMEM when memory allocation fails and zero upon success.
*/
static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
- struct srp_cmd *srp_cmd,
- enum dma_data_direction *dir, u64 *data_len)
+ struct srp_cmd *srp_cmd, enum dma_data_direction *dir,
+ struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len)
{
- struct srp_indirect_buf *idb;
- struct srp_direct_buf *db;
- unsigned add_cdb_offset;
- int ret;
-
- /*
- * The pointer computations below will only be compiled correctly
- * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
- * whether srp_cmd::add_data has been declared as a byte pointer.
- */
- BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0)
- && !__same_type(srp_cmd->add_data[0], (u8)0));
-
BUG_ON(!dir);
BUG_ON(!data_len);
- ret = 0;
- *data_len = 0;
-
/*
* The lower four bits of the buffer format field contain the DATA-IN
* buffer descriptor format, and the highest four bits contain the
* DATA-OUT buffer descriptor format.
*/
- *dir = DMA_NONE;
if (srp_cmd->buf_fmt & 0xf)
/* DATA-IN: transfer data from target to initiator (read). */
*dir = DMA_FROM_DEVICE;
else if (srp_cmd->buf_fmt >> 4)
/* DATA-OUT: transfer data from initiator to target (write). */
*dir = DMA_TO_DEVICE;
+ else
+ *dir = DMA_NONE;
+
+ /* initialize data_direction early as srpt_alloc_rw_ctxs needs it */
+ ioctx->cmd.data_direction = *dir;
- /*
- * According to the SRP spec, the lower two bits of the 'ADDITIONAL
- * CDB LENGTH' field are reserved and the size in bytes of this field
- * is four times the value specified in bits 3..7. Hence the "& ~3".
- */
- add_cdb_offset = srp_cmd->add_cdb_len & ~3;
if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
- ioctx->n_rbuf = 1;
- ioctx->rbufs = &ioctx->single_rbuf;
+ struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd);
- db = (struct srp_direct_buf *)(srp_cmd->add_data
- + add_cdb_offset);
- memcpy(ioctx->rbufs, db, sizeof(*db));
*data_len = be32_to_cpu(db->len);
+ return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt);
} else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) ||
((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) {
- idb = (struct srp_indirect_buf *)(srp_cmd->add_data
- + add_cdb_offset);
+ struct srp_indirect_buf *idb = srpt_get_desc_buf(srp_cmd);
+ int nbufs = be32_to_cpu(idb->table_desc.len) /
+ sizeof(struct srp_direct_buf);
- ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db);
-
- if (ioctx->n_rbuf >
+ if (nbufs >
(srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
pr_err("received unsupported SRP_CMD request"
" type (%u out + %u in != %u / %zu)\n",
srp_cmd->data_out_desc_cnt,
srp_cmd->data_in_desc_cnt,
be32_to_cpu(idb->table_desc.len),
- sizeof(*db));
- ioctx->n_rbuf = 0;
- ret = -EINVAL;
- goto out;
- }
-
- if (ioctx->n_rbuf == 1)
- ioctx->rbufs = &ioctx->single_rbuf;
- else {
- ioctx->rbufs =
- kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC);
- if (!ioctx->rbufs) {
- ioctx->n_rbuf = 0;
- ret = -ENOMEM;
- goto out;
- }
+ sizeof(struct srp_direct_buf));
+ return -EINVAL;
}
- db = idb->desc_list;
- memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db));
*data_len = be32_to_cpu(idb->len);
+ return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs,
+ sg, sg_cnt);
+ } else {
+ *data_len = 0;
+ return 0;
}
-out:
- return ret;
}
/**
@@ -1049,217 +1077,6 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch)
}
/**
- * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list.
- */
-static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx)
-{
- struct scatterlist *sg;
- enum dma_data_direction dir;
-
- BUG_ON(!ch);
- BUG_ON(!ioctx);
- BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs);
-
- while (ioctx->n_rdma)
- kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list);
-
- kfree(ioctx->rdma_wrs);
- ioctx->rdma_wrs = NULL;
-
- if (ioctx->mapped_sg_count) {
- sg = ioctx->sg;
- WARN_ON(!sg);
- dir = ioctx->cmd.data_direction;
- BUG_ON(dir == DMA_NONE);
- ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt,
- target_reverse_dma_direction(&ioctx->cmd));
- ioctx->mapped_sg_count = 0;
- }
-}
-
-/**
- * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list.
- */
-static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx)
-{
- struct ib_device *dev = ch->sport->sdev->device;
- struct se_cmd *cmd;
- struct scatterlist *sg, *sg_orig;
- int sg_cnt;
- enum dma_data_direction dir;
- struct ib_rdma_wr *riu;
- struct srp_direct_buf *db;
- dma_addr_t dma_addr;
- struct ib_sge *sge;
- u64 raddr;
- u32 rsize;
- u32 tsize;
- u32 dma_len;
- int count, nrdma;
- int i, j, k;
-
- BUG_ON(!ch);
- BUG_ON(!ioctx);
- cmd = &ioctx->cmd;
- dir = cmd->data_direction;
- BUG_ON(dir == DMA_NONE);
-
- ioctx->sg = sg = sg_orig = cmd->t_data_sg;
- ioctx->sg_cnt = sg_cnt = cmd->t_data_nents;
-
- count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt,
- target_reverse_dma_direction(cmd));
- if (unlikely(!count))
- return -EAGAIN;
-
- ioctx->mapped_sg_count = count;
-
- if (ioctx->rdma_wrs && ioctx->n_rdma_wrs)
- nrdma = ioctx->n_rdma_wrs;
- else {
- nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
- + ioctx->n_rbuf;
-
- ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs),
- GFP_KERNEL);
- if (!ioctx->rdma_wrs)
- goto free_mem;
-
- ioctx->n_rdma_wrs = nrdma;
- }
-
- db = ioctx->rbufs;
- tsize = cmd->data_length;
- dma_len = ib_sg_dma_len(dev, &sg[0]);
- riu = ioctx->rdma_wrs;
-
- /*
- * For each remote desc - calculate the #ib_sge.
- * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then
- * each remote desc rdma_iu is required a rdma wr;
- * else
- * we need to allocate extra rdma_iu to carry extra #ib_sge in
- * another rdma wr
- */
- for (i = 0, j = 0;
- j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
- rsize = be32_to_cpu(db->len);
- raddr = be64_to_cpu(db->va);
- riu->remote_addr = raddr;
- riu->rkey = be32_to_cpu(db->key);
- riu->wr.num_sge = 0;
-
- /* calculate how many sge required for this remote_buf */
- while (rsize > 0 && tsize > 0) {
-
- if (rsize >= dma_len) {
- tsize -= dma_len;
- rsize -= dma_len;
- raddr += dma_len;
-
- if (tsize > 0) {
- ++j;
- if (j < count) {
- sg = sg_next(sg);
- dma_len = ib_sg_dma_len(
- dev, sg);
- }
- }
- } else {
- tsize -= rsize;
- dma_len -= rsize;
- rsize = 0;
- }
-
- ++riu->wr.num_sge;
-
- if (rsize > 0 &&
- riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
- ++ioctx->n_rdma;
- riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
- sizeof(*riu->wr.sg_list),
- GFP_KERNEL);
- if (!riu->wr.sg_list)
- goto free_mem;
-
- ++riu;
- riu->wr.num_sge = 0;
- riu->remote_addr = raddr;
- riu->rkey = be32_to_cpu(db->key);
- }
- }
-
- ++ioctx->n_rdma;
- riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
- sizeof(*riu->wr.sg_list),
- GFP_KERNEL);
- if (!riu->wr.sg_list)
- goto free_mem;
- }
-
- db = ioctx->rbufs;
- tsize = cmd->data_length;
- riu = ioctx->rdma_wrs;
- sg = sg_orig;
- dma_len = ib_sg_dma_len(dev, &sg[0]);
- dma_addr = ib_sg_dma_address(dev, &sg[0]);
-
- /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */
- for (i = 0, j = 0;
- j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
- rsize = be32_to_cpu(db->len);
- sge = riu->wr.sg_list;
- k = 0;
-
- while (rsize > 0 && tsize > 0) {
- sge->addr = dma_addr;
- sge->lkey = ch->sport->sdev->pd->local_dma_lkey;
-
- if (rsize >= dma_len) {
- sge->length =
- (tsize < dma_len) ? tsize : dma_len;
- tsize -= dma_len;
- rsize -= dma_len;
-
- if (tsize > 0) {
- ++j;
- if (j < count) {
- sg = sg_next(sg);
- dma_len = ib_sg_dma_len(
- dev, sg);
- dma_addr = ib_sg_dma_address(
- dev, sg);
- }
- }
- } else {
- sge->length = (tsize < rsize) ? tsize : rsize;
- tsize -= rsize;
- dma_len -= rsize;
- dma_addr += rsize;
- rsize = 0;
- }
-
- ++k;
- if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) {
- ++riu;
- sge = riu->wr.sg_list;
- k = 0;
- } else if (rsize > 0 && tsize > 0)
- ++sge;
- }
- }
-
- return 0;
-
-free_mem:
- srpt_unmap_sg_to_ib_sge(ch, ioctx);
-
- return -ENOMEM;
-}
-
-/**
* srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator.
*/
static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
@@ -1284,12 +1101,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
BUG_ON(ioctx->ch != ch);
spin_lock_init(&ioctx->spinlock);
ioctx->state = SRPT_STATE_NEW;
- ioctx->n_rbuf = 0;
- ioctx->rbufs = NULL;
ioctx->n_rdma = 0;
- ioctx->n_rdma_wrs = 0;
- ioctx->rdma_wrs = NULL;
- ioctx->mapped_sg_count = 0;
+ ioctx->n_rw_ctx = 0;
init_completion(&ioctx->tx_done);
ioctx->queue_status_only = false;
/*
@@ -1359,7 +1172,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
* SRP_RSP sending failed or the SRP_RSP send completion has
* not been received in time.
*/
- srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
transport_generic_free_cmd(&ioctx->cmd, 0);
break;
case SRPT_STATE_MGMT_RSP_SENT:
@@ -1387,6 +1199,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
WARN_ON(ioctx->n_rdma <= 0);
atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+ ioctx->n_rdma = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n",
@@ -1403,23 +1216,6 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
__LINE__, srpt_get_cmd_state(ioctx));
}
-static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct srpt_send_ioctx *ioctx =
- container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe);
-
- if (unlikely(wc->status != IB_WC_SUCCESS)) {
- /*
- * Note: if an RDMA write error completion is received that
- * means that a SEND also has been posted. Defer further
- * processing of the associated command until the send error
- * completion has been received.
- */
- pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n",
- ioctx, wc->status);
- }
-}
-
/**
* srpt_build_cmd_rsp() - Build an SRP_RSP response.
* @ch: RDMA channel through which the request has been received.
@@ -1537,6 +1333,8 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
{
struct se_cmd *cmd;
struct srp_cmd *srp_cmd;
+ struct scatterlist *sg = NULL;
+ unsigned sg_cnt = 0;
u64 data_len;
enum dma_data_direction dir;
int rc;
@@ -1563,16 +1361,21 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch,
break;
}
- if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) {
- pr_err("0x%llx: parsing SRP descriptor table failed.\n",
- srp_cmd->tag);
+ rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt,
+ &data_len);
+ if (rc) {
+ if (rc != -EAGAIN) {
+ pr_err("0x%llx: parsing SRP descriptor table failed.\n",
+ srp_cmd->tag);
+ }
goto release_ioctx;
}
- rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb,
+ rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb,
&send_ioctx->sense_data[0],
scsilun_to_int(&srp_cmd->lun), data_len,
- TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF);
+ TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF,
+ sg, sg_cnt, NULL, 0, NULL, 0);
if (rc != 0) {
pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc,
srp_cmd->tag);
@@ -1664,23 +1467,21 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
recv_ioctx->ioctx.dma, srp_max_req_size,
DMA_FROM_DEVICE);
- if (unlikely(ch->state == CH_CONNECTING)) {
- list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
- goto out;
- }
+ if (unlikely(ch->state == CH_CONNECTING))
+ goto out_wait;
if (unlikely(ch->state != CH_LIVE))
- goto out;
+ return;
srp_cmd = recv_ioctx->ioctx.buf;
if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) {
- if (!send_ioctx)
+ if (!send_ioctx) {
+ if (!list_empty(&ch->cmd_wait_list))
+ goto out_wait;
send_ioctx = srpt_get_send_ioctx(ch);
- if (unlikely(!send_ioctx)) {
- list_add_tail(&recv_ioctx->wait_list,
- &ch->cmd_wait_list);
- goto out;
}
+ if (unlikely(!send_ioctx))
+ goto out_wait;
}
switch (srp_cmd->opcode) {
@@ -1709,8 +1510,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
}
srpt_post_recv(ch->sport->sdev, recv_ioctx);
-out:
return;
+
+out_wait:
+ list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
}
static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -1779,14 +1582,13 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc)
WARN_ON(state != SRPT_STATE_CMD_RSP_SENT &&
state != SRPT_STATE_MGMT_RSP_SENT);
- atomic_inc(&ch->sq_wr_avail);
+ atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
if (wc->status != IB_WC_SUCCESS)
pr_info("sending response for ioctx 0x%p failed"
" with status %d\n", ioctx, wc->status);
if (state != SRPT_STATE_DONE) {
- srpt_unmap_sg_to_ib_sge(ch, ioctx);
transport_generic_free_cmd(&ioctx->cmd, 0);
} else {
pr_err("IB completion has been received too late for"
@@ -1804,6 +1606,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
struct ib_qp_init_attr *qp_init;
struct srpt_port *sport = ch->sport;
struct srpt_device *sdev = sport->sdev;
+ const struct ib_device_attr *attrs = &sdev->device->attrs;
u32 srp_sq_size = sport->port_attrib.srp_sq_size;
int ret;
@@ -1832,8 +1635,17 @@ retry:
qp_init->srq = sdev->srq;
qp_init->sq_sig_type = IB_SIGNAL_REQ_WR;
qp_init->qp_type = IB_QPT_RC;
- qp_init->cap.max_send_wr = srp_sq_size;
- qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+ /*
+ * We divide up our send queue size into half SEND WRs to send the
+ * completions, and half R/W contexts to actually do the RDMA
+ * READ/WRITE transfers. Note that we need to allocate CQ slots for
+ * both both, as RDMA contexts will also post completions for the
+ * RDMA READ case.
+ */
+ qp_init->cap.max_send_wr = srp_sq_size / 2;
+ qp_init->cap.max_rdma_ctxs = srp_sq_size / 2;
+ qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE);
+ qp_init->port_num = ch->sport->port;
ch->qp = ib_create_qp(sdev->pd, qp_init);
if (IS_ERR(ch->qp)) {
@@ -1960,14 +1772,6 @@ static void __srpt_close_all_ch(struct srpt_device *sdev)
}
}
-/**
- * srpt_shutdown_session() - Whether or not a session may be shut down.
- */
-static int srpt_shutdown_session(struct se_session *se_sess)
-{
- return 1;
-}
-
static void srpt_free_ch(struct kref *kref)
{
struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref);
@@ -2386,95 +2190,6 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
return ret;
}
-/**
- * srpt_perform_rdmas() - Perform IB RDMA.
- *
- * Returns zero upon success or a negative number upon failure.
- */
-static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx)
-{
- struct ib_send_wr *bad_wr;
- int sq_wr_avail, ret, i;
- enum dma_data_direction dir;
- const int n_rdma = ioctx->n_rdma;
-
- dir = ioctx->cmd.data_direction;
- if (dir == DMA_TO_DEVICE) {
- /* write */
- ret = -ENOMEM;
- sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail);
- if (sq_wr_avail < 0) {
- pr_warn("IB send queue full (needed %d)\n",
- n_rdma);
- goto out;
- }
- }
-
- for (i = 0; i < n_rdma; i++) {
- struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr;
-
- wr->opcode = (dir == DMA_FROM_DEVICE) ?
- IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
-
- if (i == n_rdma - 1) {
- /* only get completion event for the last rdma read */
- if (dir == DMA_TO_DEVICE) {
- wr->send_flags = IB_SEND_SIGNALED;
- ioctx->rdma_cqe.done = srpt_rdma_read_done;
- } else {
- ioctx->rdma_cqe.done = srpt_rdma_write_done;
- }
- wr->wr_cqe = &ioctx->rdma_cqe;
- wr->next = NULL;
- } else {
- wr->wr_cqe = NULL;
- wr->next = &ioctx->rdma_wrs[i + 1].wr;
- }
- }
-
- ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr);
- if (ret)
- pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
- __func__, __LINE__, ret, i, n_rdma);
-out:
- if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
- atomic_add(n_rdma, &ch->sq_wr_avail);
- return ret;
-}
-
-/**
- * srpt_xfer_data() - Start data transfer from initiator to target.
- */
-static int srpt_xfer_data(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx)
-{
- int ret;
-
- ret = srpt_map_sg_to_ib_sge(ch, ioctx);
- if (ret) {
- pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret);
- goto out;
- }
-
- ret = srpt_perform_rdmas(ch, ioctx);
- if (ret) {
- if (ret == -EAGAIN || ret == -ENOMEM)
- pr_info("%s[%d] queue full -- ret=%d\n",
- __func__, __LINE__, ret);
- else
- pr_err("%s[%d] fatal error -- ret=%d\n",
- __func__, __LINE__, ret);
- goto out_unmap;
- }
-
-out:
- return ret;
-out_unmap:
- srpt_unmap_sg_to_ib_sge(ch, ioctx);
- goto out;
-}
-
static int srpt_write_pending_status(struct se_cmd *se_cmd)
{
struct srpt_send_ioctx *ioctx;
@@ -2491,11 +2206,42 @@ static int srpt_write_pending(struct se_cmd *se_cmd)
struct srpt_send_ioctx *ioctx =
container_of(se_cmd, struct srpt_send_ioctx, cmd);
struct srpt_rdma_ch *ch = ioctx->ch;
+ struct ib_send_wr *first_wr = NULL, *bad_wr;
+ struct ib_cqe *cqe = &ioctx->rdma_cqe;
enum srpt_command_state new_state;
+ int ret, i;
new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
WARN_ON(new_state == SRPT_STATE_DONE);
- return srpt_xfer_data(ch, ioctx);
+
+ if (atomic_sub_return(ioctx->n_rdma, &ch->sq_wr_avail) < 0) {
+ pr_warn("%s: IB send queue full (needed %d)\n",
+ __func__, ioctx->n_rdma);
+ ret = -ENOMEM;
+ goto out_undo;
+ }
+
+ cqe->done = srpt_rdma_read_done;
+ for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) {
+ struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+ first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, ch->sport->port,
+ cqe, first_wr);
+ cqe = NULL;
+ }
+
+ ret = ib_post_send(ch->qp, first_wr, &bad_wr);
+ if (ret) {
+ pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n",
+ __func__, ret, ioctx->n_rdma,
+ atomic_read(&ch->sq_wr_avail));
+ goto out_undo;
+ }
+
+ return 0;
+out_undo:
+ atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+ return ret;
}
static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
@@ -2517,17 +2263,17 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
*/
static void srpt_queue_response(struct se_cmd *cmd)
{
- struct srpt_rdma_ch *ch;
- struct srpt_send_ioctx *ioctx;
+ struct srpt_send_ioctx *ioctx =
+ container_of(cmd, struct srpt_send_ioctx, cmd);
+ struct srpt_rdma_ch *ch = ioctx->ch;
+ struct srpt_device *sdev = ch->sport->sdev;
+ struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr;
+ struct ib_sge sge;
enum srpt_command_state state;
unsigned long flags;
- int ret;
- enum dma_data_direction dir;
- int resp_len;
+ int resp_len, ret, i;
u8 srp_tm_status;
- ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
- ch = ioctx->ch;
BUG_ON(!ch);
spin_lock_irqsave(&ioctx->spinlock, flags);
@@ -2554,16 +2300,15 @@ static void srpt_queue_response(struct se_cmd *cmd)
return;
}
- dir = ioctx->cmd.data_direction;
-
/* For read commands, transfer the data to the initiator. */
- if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length &&
+ if (ioctx->cmd.data_direction == DMA_FROM_DEVICE &&
+ ioctx->cmd.data_length &&
!ioctx->queue_status_only) {
- ret = srpt_xfer_data(ch, ioctx);
- if (ret) {
- pr_err("xfer_data failed for tag %llu\n",
- ioctx->cmd.tag);
- return;
+ for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) {
+ struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
+
+ first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp,
+ ch->sport->port, NULL, first_wr);
}
}
@@ -2576,14 +2321,46 @@ static void srpt_queue_response(struct se_cmd *cmd)
resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status,
ioctx->cmd.tag);
}
- ret = srpt_post_send(ch, ioctx, resp_len);
- if (ret) {
- pr_err("sending cmd response failed for tag %llu\n",
- ioctx->cmd.tag);
- srpt_unmap_sg_to_ib_sge(ch, ioctx);
- srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
- target_put_sess_cmd(&ioctx->cmd);
+
+ atomic_inc(&ch->req_lim);
+
+ if (unlikely(atomic_sub_return(1 + ioctx->n_rdma,
+ &ch->sq_wr_avail) < 0)) {
+ pr_warn("%s: IB send queue full (needed %d)\n",
+ __func__, ioctx->n_rdma);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, resp_len,
+ DMA_TO_DEVICE);
+
+ sge.addr = ioctx->ioctx.dma;
+ sge.length = resp_len;
+ sge.lkey = sdev->pd->local_dma_lkey;
+
+ ioctx->ioctx.cqe.done = srpt_send_done;
+ send_wr.next = NULL;
+ send_wr.wr_cqe = &ioctx->ioctx.cqe;
+ send_wr.sg_list = &sge;
+ send_wr.num_sge = 1;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(ch->qp, first_wr, &bad_wr);
+ if (ret < 0) {
+ pr_err("%s: sending cmd response failed for tag %llu (%d)\n",
+ __func__, ioctx->cmd.tag, ret);
+ goto out;
}
+
+ return;
+
+out:
+ atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail);
+ atomic_dec(&ch->req_lim);
+ srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+ target_put_sess_cmd(&ioctx->cmd);
}
static int srpt_queue_data_in(struct se_cmd *cmd)
@@ -2599,10 +2376,6 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd)
static void srpt_aborted_task(struct se_cmd *cmd)
{
- struct srpt_send_ioctx *ioctx = container_of(cmd,
- struct srpt_send_ioctx, cmd);
-
- srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
}
static int srpt_queue_status(struct se_cmd *cmd)
@@ -2780,10 +2553,6 @@ static void srpt_add_one(struct ib_device *device)
sdev->device->name, i);
goto err_ring;
}
- snprintf(sport->port_guid, sizeof(sport->port_guid),
- "0x%016llx%016llx",
- be64_to_cpu(sport->gid.global.subnet_prefix),
- be64_to_cpu(sport->gid.global.interface_id));
}
spin_lock(&srpt_dev_lock);
@@ -2903,12 +2672,10 @@ static void srpt_release_cmd(struct se_cmd *se_cmd)
unsigned long flags;
WARN_ON(ioctx->state != SRPT_STATE_DONE);
- WARN_ON(ioctx->mapped_sg_count != 0);
- if (ioctx->n_rbuf > 1) {
- kfree(ioctx->rbufs);
- ioctx->rbufs = NULL;
- ioctx->n_rbuf = 0;
+ if (ioctx->n_rw_ctx) {
+ srpt_free_rw_ctxs(ch, ioctx);
+ ioctx->n_rw_ctx = 0;
}
spin_lock_irqsave(&ch->spinlock, flags);
@@ -3287,7 +3054,6 @@ static const struct target_core_fabric_ops srpt_template = {
.tpg_get_inst_index = srpt_tpg_get_inst_index,
.release_cmd = srpt_release_cmd,
.check_stop_free = srpt_check_stop_free,
- .shutdown_session = srpt_shutdown_session,
.close_session = srpt_close_session,
.sess_get_index = srpt_sess_get_index,
.sess_get_initiator_sid = NULL,
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
index af9b8b527340..581878782854 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.h
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -42,6 +42,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_sa.h>
#include <rdma/ib_cm.h>
+#include <rdma/rw.h>
#include <scsi/srp.h>
@@ -105,7 +106,11 @@ enum {
SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
SRPT_DEF_SG_TABLESIZE = 128,
- SRPT_DEF_SG_PER_WQE = 16,
+ /*
+ * An experimentally determined value that avoids that QP creation
+ * fails due to "swiotlb buffer is full" on systems using the swiotlb.
+ */
+ SRPT_MAX_SG_PER_WQE = 16,
MIN_SRPT_SQ_SIZE = 16,
DEF_SRPT_SQ_SIZE = 4096,
@@ -174,21 +179,17 @@ struct srpt_recv_ioctx {
struct srpt_ioctx ioctx;
struct list_head wait_list;
};
+
+struct srpt_rw_ctx {
+ struct rdma_rw_ctx rw;
+ struct scatterlist *sg;
+ unsigned int nents;
+};
/**
* struct srpt_send_ioctx - SRPT send I/O context.
* @ioctx: See above.
* @ch: Channel pointer.
- * @free_list: Node in srpt_rdma_ch.free_list.
- * @n_rbuf: Number of data buffers in the received SRP command.
- * @rbufs: Pointer to SRP data buffer array.
- * @single_rbuf: SRP data buffer if the command has only a single buffer.
- * @sg: Pointer to sg-list associated with this I/O context.
- * @sg_cnt: SG-list size.
- * @mapped_sg_count: ib_dma_map_sg() return value.
- * @n_rdma_wrs: Number of elements in the rdma_wrs array.
- * @rdma_wrs: Array with information about the RDMA mapping.
- * @tag: Tag of the received SRP information unit.
* @spinlock: Protects 'state'.
* @state: I/O context state.
* @cmd: Target core command data structure.
@@ -197,21 +198,18 @@ struct srpt_recv_ioctx {
struct srpt_send_ioctx {
struct srpt_ioctx ioctx;
struct srpt_rdma_ch *ch;
- struct ib_rdma_wr *rdma_wrs;
+
+ struct srpt_rw_ctx s_rw_ctx;
+ struct srpt_rw_ctx *rw_ctxs;
+
struct ib_cqe rdma_cqe;
- struct srp_direct_buf *rbufs;
- struct srp_direct_buf single_rbuf;
- struct scatterlist *sg;
struct list_head free_list;
spinlock_t spinlock;
enum srpt_command_state state;
struct se_cmd cmd;
struct completion tx_done;
- int sg_cnt;
- int mapped_sg_count;
- u16 n_rdma_wrs;
u8 n_rdma;
- u8 n_rbuf;
+ u8 n_rw_ctx;
bool queue_status_only;
u8 sense_data[TRANSPORT_SENSE_BUFFER];
};